附pyspider安装过程
目标页面 https://bing.gifposter.com/list/new/desc/classic.html?p=1
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2020-05-13 09:56:10
# Project: bing

import logging
from time import strftime, strptime

from pyspider.libs.base_handler import *
import pymysql


logger = logging.getLogger(__name__)


class Handler(BaseHandler):
    """pyspider handler that scrapes Bing wallpaper metadata into MySQL.

    Walks the paginated wallpaper list on bing.gifposter.com, extracts the
    title, image id and date of every entry, and inserts each entry as one
    row of the ``bing`` table.
    """

    # Request headers must live under the "headers" key; a top-level
    # "User-Agent" entry is not a crawl parameter and would not be sent.
    crawl_config = {
        "headers": {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36",
        },
    }

    # Shared connection and cursor, created once when the class is defined.
    # Keyword arguments are used because PyMySQL 1.0+ no longer accepts
    # positional connection parameters.
    # NOTE(review): credentials are hard-coded; consider moving them to
    # configuration outside the script.
    mysql_cli = pymysql.Connect(
        host='127.0.0.1',
        user='root',
        password='mypasswd',
        database='bing',
        port=3306,
        charset='utf8',
    )
    cursor = mysql_cli.cursor()

    # Run on_start once a day.
    @every(minutes=24 * 60)
    def on_start(self):
        """Seed the crawl with the first page of the wallpaper list."""
        self.crawl('https://bing.gifposter.com/list/new/desc/classic.html?p=1', callback=self.index_page)

    # Do not re-crawl the same page within ten days.
    @config(age=10 * 24 * 60 * 60)
    def index_page(self, response):
        """Queue the next list page and extract the wallpapers on this one.

        Returns:
            A list of dicts with the keys ``title``, ``base_url`` and
            ``date`` (formatted ``YYYYMMDD``). Entries without an image
            source or with an unparseable date are skipped.
        """
        # Follow the "next page" button unless it is the disabled placeholder.
        for link in response.doc('a.page-btn').items():
            href = link.attr.href
            if link.text() == ">" and href and href != 'javascript:void(0)':
                self.crawl(href, callback=self.index_page)

        # Collect the wallpaper entries.
        content_list = []
        for item in response.doc('.imglist li').items():
            src = item('img').attr.src  # src attribute of the thumbnail
            if not src:
                logger.warning('skipping list entry without an image src')
                continue

            # Convert e.g. 'May 13, 2020' to '20200513'.
            date_text = item('time').text()
            try:
                date = strftime('%Y%m%d', strptime(date_text, '%b %d, %Y'))
            except ValueError:
                logger.warning('skipping list entry with unparseable date %r', date_text)
                continue

            content_list.append({
                'title': item('span').text(),
                # Keep only the identifying part of the file name; rebuild the
                # full URL as
                # https://www.bing.com/th?id=OHR.{base_url}_1920x1080.jpg
                # (Bing has removed images older than 2018-06-06).
                'base_url': src.split('/')[-1].split('_1920x1080.jpg_sm')[0],
                'date': date,
            })
        return content_list

    # Values returned by callbacks are handed to on_result.
    def on_result(self, result):
        """Persist the scraped entries, one INSERT and commit per entry.

        Args:
            result: Value returned by a callback; expected to be the list of
                dicts produced by ``index_page``. Falsy values are ignored.

        A failed insert is rolled back and logged, and the remaining entries
        are still processed.
        """
        if not result:
            return

        for content in result:
            try:
                # Column names come from the dict keys built in index_page;
                # the values are passed as bound parameters.
                sql = "INSERT INTO bing(%s) VALUES(%s)" % (
                    ','.join(content.keys()),
                    ','.join(['%s'] * len(content)),
                )
                self.cursor.execute(sql, list(content.values()))
                self.mysql_cli.commit()
                print('数据存储成功')
            except Exception:
                self.mysql_cli.rollback()
                # Report the failure instead of discarding it silently.
                logger.exception('failed to store entry %r', content)