import scrapy
from ..items import PfdsjItemclass PfsjSpider(scrapy.Spider):name = 'pfsj'#allowed_domains = ['xxx.com']start_urls = ['https://www.pingfandeshijie.net']#获取一级页面中的a链接地址def parse(self,response):#获取目录链接a_href=response.xpath("//center/table/tr/td/center/h2/a/@href").extract()# print(a_href)for v in a_href:# print(v)# 将返回的a链接交给调度器进行处理,将处理的结果传递给two_parseyield scrapy.Request(url=v,callback=self.two_parse)# 获取二级页面中的a链接地址def two_parse(self,respond):# print(respond)# 获取a链接a_href=respond.xpath('//div[@class="main"]/div[2]/ul/li/a/@href').extract()# print(a_href)for i in a_href:# 将返回的a链接交给调度器进行处理,将处理的结果传递给three_parseyield scrapy.Request(url=i,callback=self.three_parse)# 获取三级页面中的a链接地址def three_parse(self,respond):# print(type(book_name))page=respond.xpath('/html/body/div[3]/h1/text()').get().split()part=page[0]if len(page)>1:page_num=page[1]else:page_num = page[0]content=respond.xpath('//body/div[3]/div[2]/p/text()').extract()content='\n'.join(content)# print(content)item = PfdsjItem()# 给KugouItem对象属性赋值item['page_num'] = page_numitem['part'] = partitem['content'] = content.replace('\\u300', '')yield item
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.htmlimport os
# useful for handling different item types with a single interface
from itemadapter import ItemAdapterclass PfdsjPipeline:# 第一次执行管道类文件的时候执行一次def open_spider(self,spider):dirname = './平凡的世界'if not os.path.exists(dirname):os.mkdir(dirname)def process_item(self, item, spider):dirname = './%s/'%('平凡的世界') + item['part']if not os.path.exists(dirname):os.mkdir(dirname)# 章节名/章节数——标题filename = "./%s/%s/%s" % ('平凡的世界',item['part'],item['page_num'])with open(filename + '.txt', 'a', encoding='utf-8') as f:f.write(item['content'])
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy


class PfdsjItem(scrapy.Item):
    """Container for a single scraped chapter of the novel."""

    book_name = scrapy.Field()  # declared but not filled by the current spider
    part = scrapy.Field()       # which part of the book the chapter belongs to
    page_num = scrapy.Field()   # chapter heading / number
    content = scrapy.Field()    # full chapter body text
所需第三方库:scrapy库
运行结果: