1. Crawling method: Python + Selenium
2. Workflow
Selenium types in the search keyword, crawls the result pages automatically, creates a folder for each movie, and saves its magnet/download link into a text file.
3. The code
#!/usr/bin/python
# -*- coding: utf-8 -*-
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
from bs4 import BeautifulSoup
import os
import urllib2
import time
import random
import re

browser = webdriver.Chrome()
#browser.set_window_position(20, 40)
#browser.set_window_size(1100, 700)
browser.maximize_window()  # maximize the window
# implicit wait
browser.implicitly_wait(10)
browser.get('http://www.dytt8.net/')
# type the search keyword u'恐怖' ("horror") into the search box
browser.find_element_by_xpath('//*[@id="header"]/div/div[3]/div[2]/div[2]/div[1]/div/div[1]/form/div[1]/p[1]/input').clear()
browser.find_element_by_xpath('//*[@id="header"]/div/div[3]/div[2]/div[2]/div[1]/div/div[1]/form/div[1]/p[1]/input').send_keys(u'恐怖')

def close(browser):
    # get the handle of the current window (window A)
    handle = browser.current_window_handle
    # get the handles of all open windows (windows A and B)
    handles = browser.window_handles
    # walk through the windows
    for newhandle in handles:
        # pick out the newly opened window B
        if newhandle != handle:
            browser.switch_to_window(newhandle)
            # close window B
            browser.close()
    browser.switch_to_window(handles[0])

def change(browser):
    # get the handle of the current window (window A)
    handle = browser.current_window_handle
    # get the handles of all open windows (windows A and B)
    handles = browser.window_handles
    # walk through the windows
    for newhandle in handles:
        # pick out the newly opened window B and switch to it
        if newhandle != handle:
            browser.switch_to_window(newhandle)

def back(browser):
    # get the handle of the current window (window A)
    handle = browser.current_window_handle
    # get the handles of all open windows (windows A and B)
    handles = browser.window_handles
    # walk through the windows
    for newhandle in handles:
        # this time match the current window and close it
        if newhandle == handle:
            browser.switch_to_window(newhandle)
            browser.close()
    browser.switch_to_window(handles[0])

def backN(browser):
    # get the handle of the current window (window A)
    handle = browser.current_window_handle
    # get the handles of all open windows
    handles = browser.window_handles
    # walk through the windows
    for newhandle in handles:
        # close every window other than the current one
        if newhandle != handle:
            browser.switch_to_window(newhandle)
            browser.close()
    browser.switch_to_window(handles[1])

# close the extra window the home page pops open
close(browser)
browser.find_element_by_xpath('//*[@id="header"]/div/div[3]/div[2]/div[2]/div[1]/div/div[1]/form/div[1]/p[1]/input').clear()
browser.find_element_by_xpath('//*[@id="header"]/div/div[3]/div[2]/div[2]/div[1]/div/div[1]/form/div[1]/p[1]/input').send_keys(u'恐怖')
ele = browser.find_element_by_xpath('//*[@id="header"]/div/div[3]/div[2]/div[2]/div[1]/div/div[1]/form/div[2]/input')
## the search button cannot be clicked directly,
## so simulate pressing Enter instead
ele.send_keys(Keys.ENTER)
## this successfully jumps to the results page

obj = BeautifulSoup(browser.page_source, 'html.parser')

def fun(obj, num):
    tables = obj.find('div', {'class': 'co_content8'}).find_all('table')
    for i, v in enumerate(tables):
        if i <= 9:
            href = v.find('a').get('href')
            title = v.find('a').text
            ## strip characters that are illegal in Windows file names
            title = re.sub('[\/:*?"<>|]', '-', title)
            disk_url = 'E:/test/dytt/bt/' + title
            # create a folder for this movie
            if os.path.exists(disk_url):
                print 'This folder already exists!'
            else:
                os.mkdir(disk_url)
            print title
            #url = 'http://www.ygdy8.com' + href
            ### open the detail page in a new window
            js = "window.open('http://www.ygdy8.com" + href + "')"
            browser.execute_script(js)
            #browser.get(url)
            # switch to the new window B
            change(browser)
            # right-click the download link and read its href
            try:
                qqq = browser.find_element_by_xpath('//*[@id="Zoom"]/span/table/tbody/tr/td/a')
                ActionChains(browser).context_click(qqq).perform()
                hrefs = browser.find_element_by_xpath('//*[@id="Zoom"]/span/table/tbody/tr/td/a').get_attribute('href')
                print hrefs
                with open('E:\\test\\dytt\\bt\\' + title + '\\bt.txt', 'w') as f:
                    f.write(hrefs)
            except:
                print 'We can try another way!'
                # some detail pages nest the link one level deeper
                try:
                    qqq = browser.find_element_by_xpath('//*[@id="Zoom"]/span/div[5]/table/tbody/tr/td/a')
                    ActionChains(browser).context_click(qqq).perform()
                    hrefs = browser.find_element_by_xpath('//*[@id="Zoom"]/span/div[5]/table/tbody/tr/td/a').get_attribute('href')
                    print hrefs
                    with open('E:\\test\\dytt\\bt\\' + title + '\\bt.txt', 'w') as f:
                        f.write(hrefs)
                except:
                    print 'This is a game!'
            back(browser)
    # after the loop, click through to the next results page;
    # the "next page" link sits in a different table cell on the first page
    if num == 0:
        browser.find_element_by_xpath('//*[@id="header"]/div/div[3]/div[3]/div[2]/div[2]/div[2]/ul/table[11]/tbody/tr/td[9]/a').click()
    else:
        browser.find_element_by_xpath('//*[@id="header"]/div/div[3]/div[3]/div[2]/div[2]/div[2]/ul/table[11]/tbody/tr/td[10]/a').click()
    change(browser)
    backN(browser)
    obj = BeautifulSoup(browser.page_source, 'html.parser')
    # recurse onto the next page
    fun(obj, 1)

def get_html(url):
    '''fetch the raw html of a page'''
    ## define the request headers
    user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36"
    headers = {"User-Agent": user_agent}
    request = urllib2.Request(url, headers=headers)
    #request.encoding = 'utf-8'
    try:
        html = urllib2.urlopen(request).read()
    except urllib2.URLError as e:
        print url + ' Download error:', e.reason
        html = None
    return html

fun(obj, 0)
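
Two notes on the script. First, switch_to_window is deprecated in current Selenium releases; the same window juggling is written as browser.switch_to.window(handle) there. A minimal sketch of the change and close helpers in that style (Python 3, newer Selenium; the logic is unchanged, only the call is renamed):

# sketch only: the same helpers with the non-deprecated switch_to.window API
def change(browser):
    # switch from the current window (A) to the newly opened one (B)
    handle = browser.current_window_handle
    for newhandle in browser.window_handles:
        if newhandle != handle:
            browser.switch_to.window(newhandle)

def close(browser):
    # close the newly opened window (B) and return to the first one (A)
    handle = browser.current_window_handle
    handles = browser.window_handles
    for newhandle in handles:
        if newhandle != handle:
            browser.switch_to.window(newhandle)
            browser.close()
    browser.switch_to.window(handles[0])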
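
Second, get_html is defined but never called. It could replace the window-opening dance entirely: fetch each detail page over HTTP and pull the first link out of the Zoom block with BeautifulSoup. A sketch under that assumption (get_link is a hypothetical helper; the id="Zoom" selector is taken from the XPaths above and is site-specific, so it may change):

def get_link(href):
    # hypothetical helper: fetch the detail page with the unused get_html
    html = get_html('http://www.ygdy8.com' + href)
    if html is None:
        return None
    soup = BeautifulSoup(html, 'html.parser')
    # the download link lives inside the div with id="Zoom",
    # the same element the Selenium XPaths above point at
    zoom = soup.find('div', {'id': 'Zoom'})
    if zoom is None:
        return None
    a = zoom.find('a')
    return a.get('href') if a is not None else None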