最近学习threading库,接触了线程相关的知识。如题所示,在同一台机器、爬取深度相同的条件下,单线程需要33.72s,多线程仅需1.78s。可见多开线程对于提高I/O密集型爬虫的效率行之有效。
声明:
0.本代码仅限学习交流使用
1.转载附作者ID及原作品链接
2.自觉遵守协议 维护网站权益
单线程完整代码(修改 path 和请求头 header 后即可使用):
import re
import requests
import json
import time
# Fetch one page of hotel listings and extract name / price / score per hotel.
#   url       -- endpoint the form is POSTed to
#   header    -- dict sent as the HTTP request headers
#   form_data -- dict sent as the POST form body
# Returns a list of dicts, each with the keys "name", "price" and "score".
def get_detail(url,header,form_data):
response=requests.post(url,headers=header,data=form_data)
# Use the encoding detected from the body content when decoding response.text.
response.encoding=response.apparent_encoding
# Parse the response body as JSON.
dictionary=json.loads(response.text)
# Pull the hotel-list HTML fragment out of the parsed JSON.
text=dictionary['value']['hotelListHtml']
# Split the HTML into one fragment per hotel.
# NOTE(review): the string literal on the next line is cut off in this copy of
# the source (the HTML delimiter appears to have been stripped) -- restore it
# from the original before running.
hotel_list=text.split('
dict_list=[]
for each_hotel in hotel_list:
try:
# Take the first match of each pattern; [0] raises IndexError when there is none.
# NOTE(review): the first two patterns look truncated as well (the HTML that
# should surround/terminate the capture group seems to be missing) -- confirm.
name=re.findall(r'class="info_cn">(.*?)',each_hotel)[0]
price=re.findall(r'(.*?)',each_hotel)[0]
score=re.findall(r'data-score="(.*?)"',each_hotel)[0]
new_dict={}
new_dict["name"]=name
new_dict["price"]=price
new_dict["score"]=score
dict_list.append(new_dict)
# Any exception raised for this fragment makes the loop skip it.
# NOTE(review): a bare except also hides unrelated errors; consider narrowing it.
except:
continue
return dict_list
def save(detail_list, path):
    """Append one CSV row (name, price, score) per hotel to the file at *path*.

    Args:
        detail_list: iterable of dicts, each providing the keys "name",
            "price" and "score".
        path: path of the CSV file; it is opened in append mode, so it is
            created when missing and existing content is kept.

    Raises:
        KeyError: if an entry lacks one of the three keys.
        OSError: if the file cannot be opened or written.
    """
    # Local import so this function does not depend on the file's import block.
    import csv

    # "with" guarantees the handle is closed even if a row raises midway.
    with open(path, "a") as file:
        # csv.writer quotes fields containing commas or quotes, so such a hotel
        # name can no longer shift the columns.  lineterminator="\n" keeps the
        # line ending identical to the previous hand-built rows.
        writer = csv.writer(file, lineterminator="\n")
        for each_detail in detail_list:
            writer.writerow(
                [each_detail["name"], each_detail["price"], each_detail["score"]]
            )
def main():
    """Crawl `pages` result pages one after another and append them to a CSV file.

    Writes a header row to the output file, then for every page index sets
    'listRequest.pageIndex' in the form, calls get_detail() and hands the
    result to save().  The request headers, form fields, URL and output path
    are hard-coded below and must be adapted before use.
    """
    # Request headers copied from a captured browser request.
    # NOTE(review): the Cookie below holds captured session values; they
    # presumably expire and need to be replaced with fresh ones.
    header = {
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9,ja;q=0.8,en;q=0.7',
        'Connection': 'keep-alive',
        'Content-Length': '2273',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'Cookie': 'CookieGuid=2cbbf4d7-ed57-43f8-8842-4a74ad939a46; H5CookieId=db6aea90-999f-4be0-9930-7cfcea066214; _fid=2cbbf4d7-ed57-43f8-8842-4a74ad939a46; CitySearchHistory=0101%23%E5%8C%97%E4%BA%AC%23beijing%23; firsttime=1583163644996; SessionGuid=abe14edf-638e-44e5-a26d-94807cbf9e7a; Esid=038b4b4a-fd3a-4c88-bb54-fe60f6a9bf12; com.eLong.CommonService.OrderFromCookieInfo=Orderfromtype=1&Parentid=1000&Status=1&Cookiesdays=30&Coefficient=0.0&Pkid=1105&Priority=9001&Isusefparam=0&Makecomefrom=0&Savecookies=0; fv=pcweb; anti_token=4D71FDF2-2357-461A-984C-56958AE1A7CE; ShHotel=InDate=2020-03-03&CityID=0101&CityNameEN=beijing&CityNameCN=%E5%8C%97%E4%BA%AC&OutDate=2020-03-04&CityName=%E5%8C%97%E4%BA%AC; ext_param=bns%3D4%26ct%3D3; s_cc=true; __tctmc=0.215881358; __tctmc=20377580.26050747; __tctmd=20377580.254392154; __tctma=20377580.1583163637156064.1583163637230.1583163637230.1583200239719.2; __tctmu=20377580.0.0; __tctmz=20377580.1583200239719.2.1.utmccn=(referral)|utmcsr=bing.com|utmcct=|utmcmd=referral; longKey=1583163637156064; __tctrack=0; __tctmd=0.1; lasttime=1583202216497; s_visit=1; User-Ref-SessionId=78bd-e80d-13ad-2b0a-92aa-532a; trace_extend={"deviceid":"2cbbf4d7-ed57-43f8-8842-4a74ad939a46","appid":"6","userid":"2cbbf4d7-ed57-43f8-8842-4a74ad939a46","orderfromid":1105,"sessionid":"78bd-e80d-13ad-2b0a-92aa-532a","pvid":"c1b8aaeb"}; __tctmb=0.1284443720926117.1583203761350.1583203761350.1; s_sq=elongcom%3D%2526pid%253Dhotel.elong.com%25252Fbeijing%2526pidt%253D1%2526oid%253Djavascript%25253Avoid(0)%2526ot%253DA; __tccgd=0.0; JSESSIONID=1F53BCB446035ADE53A945505FBF2D47',
        'Host': 'hotel.elong.com',
        'Origin': 'http://hotel.elong.com',
        'Referer': 'http://hotel.elong.com/beijing/',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36',
        'X-Requested-With': 'XMLHttpRequest',
    }
    # Form body of the search request; only 'listRequest.pageIndex' is changed
    # per page further down.
    form_data = {
        'code': '7611836',
        'listRequest.areaID': '',
        'listRequest.bedLargeTypes': '',
        'listRequest.bookingChannel': '1',
        'listRequest.breakfasts': '0',
        'listRequest.cancelFree': 'false',
        'listRequest.cardNo': '192928',
        'listRequest.checkInDate': '2020-03-03 00:00:00',
        'listRequest.checkOutDate': '2020-03-04 00:00:00',
        'listRequest.cityID': '0101',
        'listRequest.cityName': '北京',
        'listRequest.crawledFlag': '0',
        'listRequest.customLevel': '11',
        'listRequest.discountIds': '',
        'listRequest.distance': '20000',
        'listRequest.endLat': '0',
        'listRequest.endLng': '0',
        'listRequest.epcCreateOrderGuideVersion': 'C',
        'listRequest.facilityIds': '',
        'listRequest.guokaoFlag': 'false',
        'listRequest.highPrice': '0',
        'listRequest.hotelBrandIDs': '',
        'listRequest.hotelIDs': '',
        'listRequest.interceptAction': '0',
        'listRequest.isAdvanceSave': 'false',
        'listRequest.isAfterCouponPrice': 'true',
        'listRequest.isCoupon': 'false',
        'listRequest.isDebug': 'false',
        'listRequest.isLimitTime': 'false',
        'listRequest.isLogin': 'false',
        'listRequest.isMobileOnly': 'true',
        'listRequest.isNeed5Discount': 'true',
        'listRequest.isNeedNotContractedHotel': 'false',
        'listRequest.isNeedSimilarPrice': 'false',
        'listRequest.isReturnNoRoomHotel': 'true',
        'listRequest.isStaySave': 'false',
        'listRequest.isTrace': 'false',
        'listRequest.isUnionSite': 'false',
        'listRequest.isnstantConfirm': 'false',
        'listRequest.keywords': '',
        'listRequest.keywordsType': '0',
        'listRequest.language': 'cn',
        'listRequest.lat': '39.9059093',
        'listRequest.listType': '0',
        'listRequest.lng': '116.3913489',
        'listRequest.lowPrice': '0',
        'listRequest.orderFromID': '1105',
        'listRequest.pageIndex': '1',
        'listRequest.pageSize': '20',
        'listRequest.payMethod': '0',
        'listRequest.personOfRoom': '0',
        'listRequest.poiId': '0',
        'listRequest.poiName': '',
        'listRequest.productTypes': '1,6,26',
        'listRequest.promotionChannelCode': '0000',
        'listRequest.promotionSwitch': '-1',
        'listRequest.proxyID': 'ZD',
        'listRequest.rankType': '0',
        'listRequest.returnFilterItem': 'true',
        'listRequest.sectionId': '',
        'listRequest.sellChannel': '1',
        'listRequest.seoHotelStar': '0',
        'listRequest.sortDirection': '1',
        'listRequest.sortMethod': '1',
        'listRequest.standBack': '-1',
        'listRequest.starLevels': '',
        'listRequest.startLat': '0',
        'listRequest.startLng': '0',
        'listRequest.sug_act_info': '',
        'listRequest.taRecommend': 'false',
        'listRequest.themeIds': '',
        'listRequest.traceId': 'b19bcbae-5495-4ce0-ad0e-3db4778d75e7',
        'listRequest.wordId': '',
        'listRequest.wordType': '-1',
        'listRequest.elongToken': '2cbbf4d7-ed57-43f8-8842-4a74ad939a46',
        'listRequest.trace_token': '|*|cityId:101|*|qId:1b8e11de-115a-4191-a0ce-b9e679ae66cb|*|st:city|*|sId:101|*|',
    }
    # Crawl depth: number of result pages to request.
    pages = 10
    url = 'http://hotel.elong.com/ajax/tmapilist/asyncsearch'
    path = r"C:\Users\asus\Desktop\CS\PYTHON\实例\酒店信息.csv"
    # Write the header row (name, price, score).  The file is opened in append
    # mode, so every run adds another header line; "with" closes the handle
    # even if the write fails.
    with open(path, "a") as file:
        file.write("名称" + "," + "价格" + "," + "评分" + "\n")
    for page in range(pages):
        # Select the page to fetch by rewriting the page index in the form.
        # NOTE(review): the captured form uses pageIndex '1' while this loop
        # sends 0..pages-1 -- confirm whether the endpoint is 1-based.
        form_data['listRequest.pageIndex'] = str(page)
        detail_list = get_detail(url, header, form_data)
        save(detail_list, path)
if __name__ == "__main__":
    # Measure the wall-clock duration of the whole crawl.
    started_at = time.time()
    main()
    print("Execute over")
    finished_at = time.time()
    elapsed_seconds = finished_at - started_at
    print(elapsed_seconds)
    # Run time reported by the author: 33.72 s
单线程代码至此结束。
以下为多线程完整代码(同样需要修改 path 和请求头 header):
import re
import requests
import json
import time
import threading
# Fetch one page of hotel listings, extract name / price / score per hotel and
# pass the result to save().  main() uses this function as the thread target.
#   url       -- endpoint the form is POSTed to
#   header    -- dict sent as the HTTP request headers
#   form_data -- dict sent as the POST form body
#   path      -- output file path forwarded to save()
# There is no return statement, so the function returns None.
def get_detail(url,header,form_data,path):
response=requests.post(url,headers=header,data=form_data)
# Use the encoding detected from the body content when decoding response.text.
response.encoding=response.apparent_encoding
# Parse the response body as JSON.
dictionary=json.loads(response.text)
# Pull the hotel-list HTML fragment out of the parsed JSON.
text=dictionary['value']['hotelListHtml']
# Split the HTML into one fragment per hotel.
# NOTE(review): the string literal on the next line is cut off in this copy of
# the source (the HTML delimiter appears to have been stripped) -- restore it
# from the original before running.
hotel_list=text.split('
dict_list=[]
for each_hotel in hotel_list:
try:
# Take the first match of each pattern; [0] raises IndexError when there is none.
# NOTE(review): the first two patterns look truncated as well (the HTML that
# should surround/terminate the capture group seems to be missing) -- confirm.
name=re.findall(r'class="info_cn">(.*?)',each_hotel)[0]
price=re.findall(r'(.*?)',each_hotel)[0]
score=re.findall(r'data-score="(.*?)"',each_hotel)[0]
new_dict={}
new_dict["name"]=name
new_dict["price"]=price
new_dict["score"]=score
dict_list.append(new_dict)
# Any exception raised for this fragment makes the loop skip it.
# NOTE(review): a bare except also hides unrelated errors; consider narrowing it.
except:
continue
# Hand-off point between the two functions: persist this page's rows.
save(dict_list,path)
# Serialises file appends: save() is reached from several worker threads at
# once (each get_detail thread calls it), all writing to the same file.
_SAVE_LOCK = threading.Lock()


def save(detail_list, path):
    """Append one CSV row (name, price, score) per hotel to the file at *path*.

    Safe to call from multiple threads: the open/write/close sequence runs
    under a module-level lock so rows from different pages cannot interleave.

    Args:
        detail_list: iterable of dicts, each providing the keys "name",
            "price" and "score".
        path: path of the CSV file; it is opened in append mode, so it is
            created when missing and existing content is kept.

    Raises:
        KeyError: if an entry lacks one of the three keys.
        OSError: if the file cannot be opened or written.
    """
    # Local import so this function does not depend on the file's import block.
    import csv

    # Both context managers release their resource even if a row raises midway.
    with _SAVE_LOCK, open(path, "a") as file:
        # csv.writer quotes fields containing commas or quotes, so such a hotel
        # name can no longer shift the columns.  lineterminator="\n" keeps the
        # line ending identical to the previous hand-built rows.
        writer = csv.writer(file, lineterminator="\n")
        for each_detail in detail_list:
            writer.writerow(
                [each_detail["name"], each_detail["price"], each_detail["score"]]
            )
def main():
    """Crawl `pages` result pages concurrently, one thread per page.

    Writes a header row to the output file, starts one thread per page index
    with get_detail() as the target (which fetches the page and saves its
    rows), then waits for all threads to finish.  The request headers, form
    fields, URL and output path are hard-coded below and must be adapted
    before use.
    """
    # Request headers copied from a captured browser request.
    # NOTE(review): the Cookie below holds captured session values; they
    # presumably expire and need to be replaced with fresh ones.
    header = {
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9,ja;q=0.8,en;q=0.7',
        'Connection': 'keep-alive',
        'Content-Length': '2273',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'Cookie': 'CookieGuid=2cbbf4d7-ed57-43f8-8842-4a74ad939a46; H5CookieId=db6aea90-999f-4be0-9930-7cfcea066214; _fid=2cbbf4d7-ed57-43f8-8842-4a74ad939a46; CitySearchHistory=0101%23%E5%8C%97%E4%BA%AC%23beijing%23; firsttime=1583163644996; SessionGuid=abe14edf-638e-44e5-a26d-94807cbf9e7a; Esid=038b4b4a-fd3a-4c88-bb54-fe60f6a9bf12; com.eLong.CommonService.OrderFromCookieInfo=Orderfromtype=1&Parentid=1000&Status=1&Cookiesdays=30&Coefficient=0.0&Pkid=1105&Priority=9001&Isusefparam=0&Makecomefrom=0&Savecookies=0; fv=pcweb; anti_token=4D71FDF2-2357-461A-984C-56958AE1A7CE; ShHotel=InDate=2020-03-03&CityID=0101&CityNameEN=beijing&CityNameCN=%E5%8C%97%E4%BA%AC&OutDate=2020-03-04&CityName=%E5%8C%97%E4%BA%AC; ext_param=bns%3D4%26ct%3D3; s_cc=true; __tctmc=0.215881358; __tctmc=20377580.26050747; __tctmd=20377580.254392154; __tctma=20377580.1583163637156064.1583163637230.1583163637230.1583200239719.2; __tctmu=20377580.0.0; __tctmz=20377580.1583200239719.2.1.utmccn=(referral)|utmcsr=bing.com|utmcct=|utmcmd=referral; longKey=1583163637156064; __tctrack=0; __tctmd=0.1; lasttime=1583202216497; s_visit=1; User-Ref-SessionId=78bd-e80d-13ad-2b0a-92aa-532a; trace_extend={"deviceid":"2cbbf4d7-ed57-43f8-8842-4a74ad939a46","appid":"6","userid":"2cbbf4d7-ed57-43f8-8842-4a74ad939a46","orderfromid":1105,"sessionid":"78bd-e80d-13ad-2b0a-92aa-532a","pvid":"c1b8aaeb"}; __tctmb=0.1284443720926117.1583203761350.1583203761350.1; s_sq=elongcom%3D%2526pid%253Dhotel.elong.com%25252Fbeijing%2526pidt%253D1%2526oid%253Djavascript%25253Avoid(0)%2526ot%253DA; __tccgd=0.0; JSESSIONID=1F53BCB446035ADE53A945505FBF2D47',
        'Host': 'hotel.elong.com',
        'Origin': 'http://hotel.elong.com',
        'Referer': 'http://hotel.elong.com/beijing/',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36',
        'X-Requested-With': 'XMLHttpRequest',
    }
    # Template form body of the search request; each thread receives its own
    # copy with 'listRequest.pageIndex' replaced.
    form_data = {
        'code': '7611836',
        'listRequest.areaID': '',
        'listRequest.bedLargeTypes': '',
        'listRequest.bookingChannel': '1',
        'listRequest.breakfasts': '0',
        'listRequest.cancelFree': 'false',
        'listRequest.cardNo': '192928',
        'listRequest.checkInDate': '2020-03-03 00:00:00',
        'listRequest.checkOutDate': '2020-03-04 00:00:00',
        'listRequest.cityID': '0101',
        'listRequest.cityName': '北京',
        'listRequest.crawledFlag': '0',
        'listRequest.customLevel': '11',
        'listRequest.discountIds': '',
        'listRequest.distance': '20000',
        'listRequest.endLat': '0',
        'listRequest.endLng': '0',
        'listRequest.epcCreateOrderGuideVersion': 'C',
        'listRequest.facilityIds': '',
        'listRequest.guokaoFlag': 'false',
        'listRequest.highPrice': '0',
        'listRequest.hotelBrandIDs': '',
        'listRequest.hotelIDs': '',
        'listRequest.interceptAction': '0',
        'listRequest.isAdvanceSave': 'false',
        'listRequest.isAfterCouponPrice': 'true',
        'listRequest.isCoupon': 'false',
        'listRequest.isDebug': 'false',
        'listRequest.isLimitTime': 'false',
        'listRequest.isLogin': 'false',
        'listRequest.isMobileOnly': 'true',
        'listRequest.isNeed5Discount': 'true',
        'listRequest.isNeedNotContractedHotel': 'false',
        'listRequest.isNeedSimilarPrice': 'false',
        'listRequest.isReturnNoRoomHotel': 'true',
        'listRequest.isStaySave': 'false',
        'listRequest.isTrace': 'false',
        'listRequest.isUnionSite': 'false',
        'listRequest.isnstantConfirm': 'false',
        'listRequest.keywords': '',
        'listRequest.keywordsType': '0',
        'listRequest.language': 'cn',
        'listRequest.lat': '39.9059093',
        'listRequest.listType': '0',
        'listRequest.lng': '116.3913489',
        'listRequest.lowPrice': '0',
        'listRequest.orderFromID': '1105',
        'listRequest.pageIndex': '1',
        'listRequest.pageSize': '20',
        'listRequest.payMethod': '0',
        'listRequest.personOfRoom': '0',
        'listRequest.poiId': '0',
        'listRequest.poiName': '',
        'listRequest.productTypes': '1,6,26',
        'listRequest.promotionChannelCode': '0000',
        'listRequest.promotionSwitch': '-1',
        'listRequest.proxyID': 'ZD',
        'listRequest.rankType': '0',
        'listRequest.returnFilterItem': 'true',
        'listRequest.sectionId': '',
        'listRequest.sellChannel': '1',
        'listRequest.seoHotelStar': '0',
        'listRequest.sortDirection': '1',
        'listRequest.sortMethod': '1',
        'listRequest.standBack': '-1',
        'listRequest.starLevels': '',
        'listRequest.startLat': '0',
        'listRequest.startLng': '0',
        'listRequest.sug_act_info': '',
        'listRequest.taRecommend': 'false',
        'listRequest.themeIds': '',
        'listRequest.traceId': 'b19bcbae-5495-4ce0-ad0e-3db4778d75e7',
        'listRequest.wordId': '',
        'listRequest.wordType': '-1',
        'listRequest.elongToken': '2cbbf4d7-ed57-43f8-8842-4a74ad939a46',
        'listRequest.trace_token': '|*|cityId:101|*|qId:1b8e11de-115a-4191-a0ce-b9e679ae66cb|*|st:city|*|sId:101|*|',
    }
    # Crawl depth: number of result pages to request (one thread each).
    pages = 10
    url = 'http://hotel.elong.com/ajax/tmapilist/asyncsearch'
    path = r"C:\Users\asus\Desktop\酒店信息.csv"
    # Write the header row (name, price, score) before any worker starts.  The
    # file is opened in append mode, so every run adds another header line;
    # "with" closes the handle even if the write fails.
    with open(path, "a") as file:
        file.write("名称" + "," + "价格" + "," + "评分" + "\n")
    threads = []
    for page in range(pages):
        # Give every thread its OWN form dict.  Previously all threads shared
        # one dict that this loop kept mutating, so a thread that had not yet
        # sent its request could pick up a later page index (duplicate or
        # missing pages).
        # NOTE(review): the captured form uses pageIndex '1' while this loop
        # sends 0..pages-1 -- confirm whether the endpoint is 1-based.
        page_form = dict(form_data)
        page_form['listRequest.pageIndex'] = str(page)
        thread = threading.Thread(
            target=get_detail,
            args=(url, header, page_form, path),
            name=str(page),
        )
        thread.start()
        threads.append(thread)
    # Wait for every worker so the caller's timing covers the whole crawl.
    for each_thread in threads:
        each_thread.join()
if __name__ == "__main__":
    # Measure the wall-clock duration of the whole multi-threaded crawl.
    started_at = time.time()
    main()
    print("Execute over")
    finished_at = time.time()
    elapsed_seconds = finished_at - started_at
    print(elapsed_seconds)
    # Run time reported by the author: 1.78 s
多线程代码至此结束。
此栏目旨在共享自学之乐,共勉求知之友,共塑网站和谐好学的形象。
欢迎大家在评论区发表合理的意见和指正。
如果觉得该栏目对您有帮助,望不吝点赞收藏。