Contents
- 1. Overview
- 2. Plain single-threaded code
- 3. Code with threads and queues
1. Overview

Jokes on Qiushibaike (糗事百科) are mostly plain text. When scraping, list items fall into two cases:
- short items, whose full content is available directly on the list page;
- longer items, where the list page shows only a preview and the full content sits behind a 查看全文 ("view full text") link on a separate page (a minimal sketch of this step follows the list).
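The second case is what forces an extra request per item: the spider has to follow the item's link and parse the detail page. Here is a minimal sketch of just that step; the helper name `get_full_text` is mine, while the class names `contentHerf` (the misspelling matches the XPath used in the code below) and `content` come from the full code:

```python
import requests
from lxml import etree

HEADERS = {"User-Agent": "Mozilla/5.0"}  # any desktop UA works here

def get_full_text(div):
    """Follow one list item's detail link and return the complete text."""
    # The preview div carries an <a class="contentHerf" href="/article/...">
    suffix = div.xpath(".//a[@class='contentHerf']/@href")[0]
    response = requests.get("https://www.qiushibaike.com" + suffix, headers=HEADERS)
    html = etree.HTML(response.content.decode())
    # On the detail page the full text sits directly in the content div
    return "\n\n".join(html.xpath("//div[@class='content']/text()"))
```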
2. Plain single-threaded code
```python
import requests
from lxml import etree
import pprint


class QiubaiSpider:
    def __init__(self):
        self.url_temp = "https://www.qiushibaike.com/text/page/{}/"
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 "
                          "(KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36"
        }

    def get_url_list(self):
        # The text channel has 13 list pages
        return [self.url_temp.format(i) for i in range(1, 14)]

    def parse_url(self, url):
        response = requests.get(url, headers=self.headers)
        return response.content.decode()

    def get_content_list(self, html_str):
        html = etree.HTML(html_str)
        # One div per joke
        div_list = html.xpath("//div[contains(@class,'col1')]/div[contains(@class,'article')]")
        item_list = []
        for div in div_list:
            item = {}
            content = div.xpath(".//div[@class='content']/span/text()")
            item["content"] = "\n\n".join(content)
            # Long items end with a 查看全文 ("view full text") link: follow it
            if "查看全文" in item["content"]:
                suffix = div.xpath(".//a[@class='contentHerf']/@href")[0]
                url_tmp = "{}{}".format("https://www.qiushibaike.com", suffix)
                item["content"] = self.get_content_all(url_tmp)
                item["remark"] = "==== stitched from the full-text page ===="
            # The gender div may be missing, so guard before indexing
            gender = div.xpath(".//div[contains(@class,'articleGender')]/@class")
            item["gender"] = gender[0].split(" ")[-1].replace("Icon", "") if len(gender) > 0 else None
            item_list.append(item)
        pprint.pprint(item_list)

    def get_content_all(self, url):
        # Fetch the detail page and join all text nodes of the content div
        html_str = self.parse_url(url)
        html = etree.HTML(html_str)
        return "\n\n".join(html.xpath("//div[@class='content']/text()"))

    def run(self):
        url_list = self.get_url_list()
        page = 0
        for url in url_list:
            page += 1
            print("-" * 20, "page", page, "-" * 20)
            html_str = self.parse_url(url)
            self.get_content_list(html_str)
        print("=" * 20, "done,", page, "pages in total", "=" * 20)


if __name__ == "__main__":
    qiubaiSpider = QiubaiSpider()
    qiubaiSpider.run()
```
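Note that everything here is sequential: the 13 list pages are fetched one after another, and every 查看全文 item costs one more blocking request inside `get_content_list`, so most of the run time is spent waiting on the network. There is also no error handling around `requests.get`, so a single failed request aborts the whole run; a `try/except` may be warranted in practice.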
3. Code with threads and queues

Building on the code above, we optimize the program by adding threads and queues: each stage of the spider (building URLs, downloading, parsing, saving) becomes one or more worker threads connected by Queue objects, as sketched below.
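The sketch is a toy pipeline, not the spider itself; the names `in_q`, `out_q`, and `worker` are illustrative. It shows the three mechanics the full code relies on: workers block on `get()`, every `get()` is matched by a `task_done()`, and `join()` returns only once all queued work is done.

```python
import threading
from queue import Queue

in_q, out_q = Queue(), Queue()

def worker():
    while True:                 # loops forever; the daemon flag ends it
        item = in_q.get()       # blocks until a task arrives
        out_q.put(item * 2)     # hand the result to the next stage
        in_q.task_done()        # match the get() above

t = threading.Thread(target=worker, daemon=True)
t.start()

for i in range(5):
    in_q.put(i)
in_q.join()                     # returns once every put() has been task_done()
print(out_q.qsize(), "results ready")  # -> 5 results ready
```

The full code: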
```python
import requests
from lxml import etree
from queue import Queue
import threading


class QiubaiSpider:
    def __init__(self):
        self.url_temp = "https://www.qiushibaike.com/text/page/{}/"
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 "
                          "(KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36"
        }
        # Hand-offs between the pipeline stages
        self.url_queue = Queue()      # list-page URLs waiting to be fetched
        self.html_queue = Queue()     # fetched HTML waiting to be parsed
        self.content_queue = Queue()  # parsed items waiting to be saved

    def get_url_list(self):
        print("get_url_list")
        for i in range(1, 14):
            self.url_queue.put(self.url_temp.format(i))

    def parse_url(self):
        print("parse_url")
        while True:
            url = self.url_queue.get()
            print(url)
            response = requests.get(url, headers=self.headers)
            self.html_queue.put(response.content.decode())
            self.url_queue.task_done()

    def get_content_list(self):
        print("get_content_list")
        while True:
            html_str = self.html_queue.get()
            html = etree.HTML(html_str)
            # One div per joke
            div_list = html.xpath("//div[contains(@class,'col1')]/div[contains(@class,'article')]")
            item_list = []
            for div in div_list:
                item = {}
                content = div.xpath(".//div[@class='content']/span/text()")
                item["content"] = "\n\n".join(content)
                # Long items end with a 查看全文 ("view full text") link: follow it
                if "查看全文" in item["content"]:
                    suffix = div.xpath(".//a[@class='contentHerf']/@href")[0]
                    url_tmp = "{}{}".format("https://www.qiushibaike.com", suffix)
                    item["content"] = self.get_content_all(url_tmp)
                    item["remark"] = "==== stitched from the full-text page ===="
                # The gender div may be missing, so guard before indexing
                gender = div.xpath(".//div[contains(@class,'articleGender')]/@class")
                item["gender"] = gender[0].split(" ")[-1].replace("Icon", "") if len(gender) > 0 else None
                item_list.append(item)
            self.content_queue.put(item_list)
            self.html_queue.task_done()

    def save_content(self):
        print("save_content")
        while True:
            content_list = self.content_queue.get()
            for i in content_list:
                print(i)
            self.content_queue.task_done()

    def get_content_all(self, url):
        # Fetch the detail page and join all text nodes of the content div
        response = requests.get(url, headers=self.headers)
        html = etree.HTML(response.content.decode())
        return "\n\n".join(html.xpath("//div[@class='content']/text()"))

    def run(self):
        thread_list = []
        t_url = threading.Thread(target=self.get_url_list)
        thread_list.append(t_url)
        for i in range(10):  # 10 downloader threads
            thread_list.append(threading.Thread(target=self.parse_url))
        for i in range(5):   # 5 parser threads
            thread_list.append(threading.Thread(target=self.get_content_list))
        t_save = threading.Thread(target=self.save_content)
        thread_list.append(t_save)
        for t in thread_list:
            t.daemon = True  # workers die when the main thread exits
            t.start()
        # Block until every queue item has been matched by task_done()
        for q in [self.url_queue, self.html_queue, self.content_queue]:
            q.join()
        print("main thread done")


if __name__ == "__main__":
    qiubaiSpider = QiubaiSpider()
    qiubaiSpider.run()
```
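The shutdown logic deserves a comment. The main thread blocks in the three `q.join()` calls, which return once every item put on each queue has been matched by a `task_done()`. At that moment all workers are idle, blocked in `get()`; because they were started as daemon threads, they are killed when the main thread prints its final message and exits, so no explicit thread joins are needed. The flip side of this pattern is that a worker that crashes between `get()` and `task_done()` leaves `join()` blocked forever, which is worth keeping in mind when adding error handling.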