1. Crawling method: Python + Selenium
2. Workflow
Selenium types in the search keyword, crawls the result pages automatically, creates a folder for each movie, and saves its magnet/download link into a text file.
3. The code
#!/usr/bin/python
# -*- coding: utf-8 -*-
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
from bs4 import BeautifulSoup
import os
import urllib2
import time
import random
import re

browser = webdriver.Chrome()
#browser.set_window_position(20, 40)
#browser.set_window_size(1100, 700)
browser.maximize_window()  # maximize the window
# implicit wait
browser.implicitly_wait(10)
browser.get('http://www.dytt8.net/')
# type the search keyword u'恐怖' ("horror") into the search box
browser.find_element_by_xpath('//*[@id="header"]/div/div[3]/div[2]/div[2]/div[1]/div/div[1]/form/div[1]/p[1]/input').clear()
browser.find_element_by_xpath('//*[@id="header"]/div/div[3]/div[2]/div[2]/div[1]/div/div[1]/form/div[1]/p[1]/input').send_keys(u'恐怖')

def close(browser):
    # get the handle of the current window (window A)
    handle = browser.current_window_handle
    # get the handles of all open windows (windows A and B)
    handles = browser.window_handles
    # walk through the windows
    for newhandle in handles:
        # pick out the newly opened window B
        if newhandle != handle:
            browser.switch_to_window(newhandle)
            # close window B
            browser.close()
    browser.switch_to_window(handles[0])

def change(browser):
    # get the handle of the current window (window A)
    handle = browser.current_window_handle
    # get the handles of all open windows (windows A and B)
    handles = browser.window_handles
    # walk through the windows
    for newhandle in handles:
        # pick out the newly opened window B and switch to it
        if newhandle != handle:
            browser.switch_to_window(newhandle)

def back(browser):
    # get the handle of the current window (window A)
    handle = browser.current_window_handle
    # get the handles of all open windows (windows A and B)
    handles = browser.window_handles
    # walk through the windows
    for newhandle in handles:
        # this time match the current window and close it
        if newhandle == handle:
            browser.switch_to_window(newhandle)
            browser.close()
    browser.switch_to_window(handles[0])

def backN(browser):
    # get the handle of the current window (window A)
    handle = browser.current_window_handle
    # get the handles of all open windows
    handles = browser.window_handles
    # walk through the windows
    for newhandle in handles:
        # close every window other than the current one
        if newhandle != handle:
            browser.switch_to_window(newhandle)
            browser.close()
    browser.switch_to_window(handles[1])

# close the extra window the home page pops open
close(browser)
browser.find_element_by_xpath('//*[@id="header"]/div/div[3]/div[2]/div[2]/div[1]/div/div[1]/form/div[1]/p[1]/input').clear()
browser.find_element_by_xpath('//*[@id="header"]/div/div[3]/div[2]/div[2]/div[1]/div/div[1]/form/div[1]/p[1]/input').send_keys(u'恐怖')
ele = browser.find_element_by_xpath('//*[@id="header"]/div/div[3]/div[2]/div[2]/div[1]/div/div[1]/form/div[2]/input')
## the search button cannot be clicked directly,
## so simulate pressing Enter instead
ele.send_keys(Keys.ENTER)
## this successfully jumps to the results page

obj = BeautifulSoup(browser.page_source, 'html.parser')

def fun(obj, num):
    tables = obj.find('div', {'class': 'co_content8'}).find_all('table')
    for i, v in enumerate(tables):
        if i <= 9:
            href = v.find('a').get('href')
            title = v.find('a').text
            ## strip characters that are illegal in Windows file names
            title = re.sub('[\/:*?"<>|]', '-', title)
            disk_url = 'E:/test/dytt/bt/' + title
            # create a folder for this movie
            if os.path.exists(disk_url):
                print 'This folder already exists!'
            else:
                os.mkdir(disk_url)
            print title
            #url = 'http://www.ygdy8.com' + href
            ### open the detail page in a new window
            js = "window.open('http://www.ygdy8.com" + href + "')"
            browser.execute_script(js)
            #browser.get(url)
            # switch to the new window B
            change(browser)
            # right-click the download link and read its href
            try:
                qqq = browser.find_element_by_xpath('//*[@id="Zoom"]/span/table/tbody/tr/td/a')
                ActionChains(browser).context_click(qqq).perform()
                hrefs = browser.find_element_by_xpath('//*[@id="Zoom"]/span/table/tbody/tr/td/a').get_attribute('href')
                print hrefs
                with open('E:\\test\\dytt\\bt\\' + title + '\\bt.txt', 'w') as f:
                    f.write(hrefs)
            except:
                print 'We can try another way!'
                # some detail pages nest the link one level deeper
                try:
                    qqq = browser.find_element_by_xpath('//*[@id="Zoom"]/span/div[5]/table/tbody/tr/td/a')
                    ActionChains(browser).context_click(qqq).perform()
                    hrefs = browser.find_element_by_xpath('//*[@id="Zoom"]/span/div[5]/table/tbody/tr/td/a').get_attribute('href')
                    print hrefs
                    with open('E:\\test\\dytt\\bt\\' + title + '\\bt.txt', 'w') as f:
                        f.write(hrefs)
                except:
                    print 'This is a game!'
            back(browser)
    # after the loop, click through to the next results page;
    # the "next page" link sits in a different table cell on the first page
    if num == 0:
        browser.find_element_by_xpath('//*[@id="header"]/div/div[3]/div[3]/div[2]/div[2]/div[2]/ul/table[11]/tbody/tr/td[9]/a').click()
    else:
        browser.find_element_by_xpath('//*[@id="header"]/div/div[3]/div[3]/div[2]/div[2]/div[2]/ul/table[11]/tbody/tr/td[10]/a').click()
    change(browser)
    backN(browser)
    obj = BeautifulSoup(browser.page_source, 'html.parser')
    # recurse onto the next page
    fun(obj, 1)

def get_html(url):
    '''fetch the raw html of a page'''
    ## define the request headers
    user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36"
    headers = {"User-Agent": user_agent}
    request = urllib2.Request(url, headers=headers)
    #request.encoding = 'utf-8'
    try:
        html = urllib2.urlopen(request).read()
    except urllib2.URLError as e:
        print url + ' Download error:', e.reason
        html = None
    return html

fun(obj, 0)
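
Two notes on the script. First, switch_to_window is deprecated in current Selenium releases; the same window juggling is written as browser.switch_to.window(handle) there. A minimal sketch of the change and close helpers in that style (Python 3, newer Selenium; the logic is unchanged, only the call is renamed):

# sketch only: the same helpers with the non-deprecated switch_to.window API
def change(browser):
    # switch from the current window (A) to the newly opened one (B)
    handle = browser.current_window_handle
    for newhandle in browser.window_handles:
        if newhandle != handle:
            browser.switch_to.window(newhandle)

def close(browser):
    # close the newly opened window (B) and return to the first one (A)
    handle = browser.current_window_handle
    handles = browser.window_handles
    for newhandle in handles:
        if newhandle != handle:
            browser.switch_to.window(newhandle)
            browser.close()
    browser.switch_to.window(handles[0])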
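
Second, get_html is defined but never called. It could replace the window-opening dance entirely: fetch each detail page over HTTP and pull the first link out of the Zoom block with BeautifulSoup. A sketch under that assumption (get_link is a hypothetical helper; the id="Zoom" selector is taken from the XPaths above and is site-specific, so it may change):

def get_link(href):
    # hypothetical helper: fetch the detail page with the unused get_html
    html = get_html('http://www.ygdy8.com' + href)
    if html is None:
        return None
    soup = BeautifulSoup(html, 'html.parser')
    # the download link lives inside the div with id="Zoom",
    # the same element the Selenium XPaths above point at
    zoom = soup.find('div', {'id': 'Zoom'})
    if zoom is None:
        return None
    a = zoom.find('a')
    return a.get('href') if a is not None else None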