# 直接上代码 (straight to the code)
#coding=UTF-8
from urllib.request import urlopen
from pyquery import PyQuery as pq
import re
import pymongo
import threading
# Shared MongoDB handles: server on localhost:27017, database "taobao",
# collection "mutitry47160".
client = pymongo.MongoClient(host='localhost', port=27017)
table = client['taobao']['mutitry47160']
# Guards the shared page counter that the crawler threads increment.
lock = threading.Lock()
# Number of the list page most recently claimed by a worker; guarded by `lock`.
i = 1


def save_to_mongo(result):
    """Insert one scraped record into the MongoDB collection ``table``.

    Prints a success message after the insert and a failure message
    (together with the record) if the insert raises; the error is reported,
    not re-raised.

    Args:
        result: dict to store; callers in this file pass
            ``{'image': ..., 'title': ...}``.
    """
    try:
        # Collection.insert was deprecated in PyMongo 3.0 and removed in 4.0;
        # insert_one is the supported single-document call.
        table.insert_one(result)
    except Exception:
        print('存储到Mongo失败', result)
    else:
        print('存储到Mongo成功')


def download_son(Son_link, l):
    """Scrape sub-page ``l`` of one gallery and store its image record.

    Args:
        Son_link: site-relative gallery path (the ``href`` read from a list
            page); it is concatenated directly with ``'index_<l>.html'``.
        l: sub-page number to fetch.
    """
    Sonson_link = 'https://www.7160.com' + Son_link + 'index_' + str(l) + '.html'
    doc3 = pq(Sonson_link, encoding='gbk')
    # Query the image node once and read both attributes from it.
    image_node = doc3('.picsbox.picsboxcenter p a img')
    product2 = {'image': image_node.attr('src'), 'title': image_node.attr('alt')}
    print(product2)
    save_to_mongo(product2)


def father_link():
    """Claim the next list page, store each gallery cover, and fan out.

    For every gallery linked from the list page this stores the first image,
    reads the gallery's page count from its pager text, and starts one
    ``download_son`` thread per remaining sub-page (2..page count).
    """
    global i
    # Capture the page number while the lock is still held. Reading ``i``
    # after releasing it could observe another thread's increment, making two
    # workers fetch the same list page while another page is never fetched.
    with lock:
        i += 1
        page = i
    # NOTE(review): ``i`` starts at 1 and is incremented before use, so the
    # first page requested is list_1_2.html -- confirm page 1 is meant to be
    # skipped.
    url = 'https://www.7160.com/rentiyishu/list_1_' + str(page) + '.html'
    print('--------------------------------------' + str(page) + '--------------------------------------')
    doc = pq(url, encoding='gbk')
    for item in doc('.news_bom-left li').items():
        Son_link = item.find('a').attr('href')
        if not Son_link:
            # No href to follow; concatenating None below would raise
            # TypeError and abort the remaining galleries on this page.
            continue
        doc2 = pq('https://www.7160.com' + Son_link, encoding='gbk')
        cover = doc2('.picsbox.picsboxcenter p a img')
        save_to_mongo({'image': cover.attr('src'), 'title': cover.attr('alt')})

        # Get the page count from the first pager link's text.
        pager_text = doc2('body > div > div.center > div.NEWS > div.picmainer > div.itempage > a:nth-child(1)').text()
        numbers = re.findall(r"\d+\.?\d*", pager_text)
        try:
            page_num = int(numbers[0])
        except (IndexError, ValueError):
            # No number in the pager text (or a non-integer match such as
            # "1.5"): nothing more to fetch for this gallery.
            continue
        print('共%d页,开始爬取' % page_num)
        try:
            for l in range(2, page_num + 1):
                threading.Thread(target=download_son, args=(Son_link, l)).start()
        except RuntimeError as exc:
            # Thread.start raises RuntimeError when no new thread can be
            # created; give up on this gallery but keep crawling the rest.
            print('could not start download thread:', exc)
def main():
    """Start 107 crawler threads, each running ``father_link`` once."""
    # The loop value is unused; ``_`` avoids reusing the name of the
    # module-level page counter ``i``.
    for _ in range(1, 108):
        worker = threading.Thread(target=father_link)
        worker.start()


if __name__ == '__main__':
    main()