# Crawl all jokes from the joke site http://xiaohua.zol.com.cn and save them to MySQL or MongoDB.
# Required fields, at minimum: joke category, joke source, joke title, joke content, and joke URL.
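# The INSERT below assumes a MySQL table named `xhdq` already exists. A minimal
# sketch of a matching schema (the column names come from the INSERT statement
# further down; the column lengths are assumptions chosen to fit the required fields):
#
#   CREATE TABLE IF NOT EXISTS xhdq (
#       id      INT AUTO_INCREMENT PRIMARY KEY,
#       xhfl    VARCHAR(50),    -- joke category
#       laiyuan VARCHAR(100),   -- joke source
#       biaoti  VARCHAR(255),   -- joke title
#       xhnr    TEXT,           -- joke content
#       xhurl   VARCHAR(255)    -- joke URL
#   ) DEFAULT CHARSET = utf8;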
import time, random  # used to pace requests between pages
import requests
import pymysql
from lxml import etree
headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.87 Safari/537.36'}
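# The site may throttle rapid requests, so a small fetch helper with retries and
# a randomized back-off can make the crawl more robust. This is an optional
# sketch (the function name, timeout, and retry count are my own choices, not
# part of the original assignment):
def get_html(url, retries=3):
    for attempt in range(retries):
        try:
            resp = requests.get(url, headers=headers, timeout=10)
            resp.raise_for_status()
            return resp.text
        except requests.RequestException:
            time.sleep(random.uniform(1, 3))  # back off before retrying
    return None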
# XPath for the joke-category links: '//div[@class="filter-links clearfix"]/a/@href'
new_url='http://xiaohua.zol.com.cn/new/'
response=requests.get(new_url,headers=headers).text
html2=etree.HTML(response)
# Extract all joke category links and names
fl_urls=html2.xpath('//div[@class="filter-links clearfix"]/a/@href')
xhfl=html2.xpath('//div[@class="filter-links clearfix"]/a/text()')
xhfl=xhfl[1:]  # skip the first link, which is not a concrete category
# print(len(xhfl),xhfl)
# Build the absolute URL for every joke category
fl_urls_list=[]
for fl_url in fl_urls:
    fl_urls_list.append('http://xiaohua.zol.com.cn'+fl_url)
fenlei_urls=fl_urls_list[1:]  # skip the first link here as well, to stay aligned with xhfl
# print(len(fenlei_urls),fenlei_urls)
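# Note: the loop below treats the number of article links on a category's first
# page as the number of pages to crawl, which is only a rough heuristic. A more
# reliable approach would read the last page number from the pagination bar; a
# sketch of that idea (the XPath for the pager is an assumption about the
# site's markup, not verified):
def get_page_count(category_html):
    nums = category_html.xpath('//div[@class="page"]//a/text()')
    digits = [int(p) for p in nums if p.strip().isdigit()]
    return max(digits) if digits else 1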
# Iterate over every joke category
for w in range(len(fenlei_urls)):
    print('Crawling category: %s' % xhfl[w])
    try:
        r = requests.get(fenlei_urls[w], headers=headers).text
        # print(r)
        html = etree.HTML(r)
        # print(html)
        # Collect the article links on the category's first page
        urls = html.xpath('//ul[@class="article-list"]/li/div[3]/a/@href')
        url_list1 = []
        for url in urls:
            url_list1.append('http://xiaohua.zol.com.cn' + url)
        print(len(url_list1), url_list1)
        for n in range(len(url_list1)):
            try:
                print('Crawling page {}'.format(n + 1))
                # The first placeholder is the category URL, the second is the page number
                url1 = '{}{}.html'.format(fenlei_urls[w], n + 1)
                r = requests.get(url1, headers=headers).text
                # print(r)
                html = etree.HTML(r)
                # Joke titles on this page
                biaoti = html.xpath('//ul[@class="article-list"]/li/span/a/text()')
                print(len(biaoti), biaoti)
                # Source of each joke
                laiyuan = html.xpath('//ul[@class="article-list"]/li/div[@class="article-source"]/span[2]//text() | //ul[@class="article-list"]/li/div[@class="article-source"]/a//text()')
                print(len(laiyuan), laiyuan)
                # Article links on this page
                urls = html.xpath('//ul[@class="article-list"]/li/div[3]/a/@href')
                url_list1 = []
                for url in urls:
                    url_list1.append('http://xiaohua.zol.com.cn' + url)
                print(len(url_list1), url_list1)
                # ----- Fetch the joke body from each article link on this page
                xhnr_list = []
                for j in range(len(url_list1)):
                    r1 = requests.get(url_list1[j], headers=headers).text
                    html1 = etree.HTML(r1)
                    xhnr1 = html1.xpath('//div[@class="article-text"]//text()')
                    # Strip whitespace/tabs and drop empty fragments
                    xhnr2 = [' '.join([i.strip() for i in xh.strip().split('\t')]) for xh in xhnr1]
                    xhnr = [i for i in xhnr2 if len(str(i)) != 0]
                    xhnr_list.append(xhnr)
                    time.sleep(random.uniform(0.5, 1.5))  # pace the requests politely
                print(xhnr_list)
                # Connect to the database (pymysql.connect), then get a cursor via conn.cursor()
                conn = pymysql.connect(host='localhost', user='root', password='123456', db='51job', charset='utf8')
                cursor = conn.cursor()
                # Insert row by row; join each joke's text fragments into one string
                insert_sql = "insert into xhdq (xhfl, laiyuan, biaoti, xhnr, xhurl) values (%s, %s, %s, %s, %s)"
                for i in range(len(url_list1)):
                    cursor.execute(insert_sql, (str(xhfl[w]), str(laiyuan[i]), str(biaoti[i]), ' '.join(xhnr_list[i]), str(url_list1[i])))
                    conn.commit()
                cursor.close()
                conn.close()
            except Exception as e:
                print('Request failed with error: %s' % e)
    except Exception as e:
        print('Request failed with error: %s' % e)
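# Alternative storage: the assignment allows MongoDB instead of MySQL. A minimal
# pymongo sketch of the equivalent insert (the connection URI and the
# database/collection names 'xiaohua'/'xhdq' are assumptions):
import pymongo

def save_to_mongo(rows):
    # rows: an iterable of (xhfl, laiyuan, biaoti, xhnr, xhurl) tuples
    client = pymongo.MongoClient('mongodb://localhost:27017/')
    coll = client['xiaohua']['xhdq']
    coll.insert_many([
        {'xhfl': fl, 'laiyuan': ly, 'biaoti': bt, 'xhnr': nr, 'xhurl': u}
        for fl, ly, bt, nr, u in rows
    ])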