python多线程实现访问页面_python实现多线程爬取酒店网页动态加载信息

news/2024/11/24 11:26:28/

最近学习threading库,接触了线程相关的知识。如题所示,在同一台机器、爬取深度相同的前提下,单线程需要33.72s,多线程仅需1.78s。可见多开线程对于提高I/O密集型爬虫的效率行之有效。

声明:

0.本代码仅限学习交流使用

1.转载附作者ID及原作品链接

2.自觉遵守协议 维护网站权益

单线程完整代码(修改 path 和请求头 header 后可用):

import re

import requests

import json

import time

def get_detail(url, header, form_data):
    """Fetch one page of the hotel list and extract name, price and score.

    POSTs ``form_data`` to ``url`` with the request headers ``header``,
    parses the reply as JSON and reads the HTML fragment stored under
    ``['value']['hotelListHtml']``.

    Returns:
        A list of dicts, one per hotel chunk that matched, each with the
        string keys ``"name"``, ``"price"`` and ``"score"``.

    Raises:
        Whatever ``requests.post`` / ``json.loads`` raise, and ``KeyError``
        if the reply lacks ``value`` or ``hotelListHtml``.
    """
    response = requests.post(url, headers=header, data=form_data)
    response.encoding = response.apparent_encoding

    # Convert the JSON reply into a dict and pull out the HTML fragment.
    dictionary = json.loads(response.text)
    text = dictionary['value']['hotelListHtml']

    # Split the fragment into one chunk per hotel.
    # NOTE(review): in the published listing this literal was broken across
    # lines (a syntax error); it is restored here as its visible content.
    # The real delimiter was presumably an HTML tag that got stripped when the
    # article was rendered -- confirm against the live markup.
    hotel_list = text.split('\n\n')

    # NOTE(review): the name and price patterns end in a lazy group with
    # nothing after it, so as written they can only capture an empty string.
    # Their closing markup looks stripped as well -- restore before relying
    # on these two fields.
    dict_list = []
    for each_hotel in hotel_list:
        try:
            name = re.findall(r'class="info_cn">(.*?)', each_hotel)[0]
            price = re.findall(r'(.*?)', each_hotel)[0]
            score = re.findall(r'data-score="(.*?)"', each_hotel)[0]
        except IndexError:
            # A chunk without all three fields is not a hotel entry; skip it.
            continue
        dict_list.append({"name": name, "price": price, "score": score})
    return dict_list

def save(detail_list, path):
    """Append one ``name,price,score`` line per hotel to the file at ``path``.

    Args:
        detail_list: iterable of dicts with the string values ``"name"``,
            ``"price"`` and ``"score"``.
        path: file to append to; it is created if missing, even when
            ``detail_list`` is empty.

    Raises:
        KeyError: if an entry lacks one of the three keys.
        TypeError: if a value is not a string.

    Fields are written verbatim, without CSV quoting, so a comma inside a
    value would shift the columns of that row.
    """
    lines = [
        ",".join((each_detail["name"], each_detail["price"], each_detail["score"])) + "\n"
        for each_detail in detail_list
    ]
    # The context manager closes the handle even if the write fails.
    with open(path, "a") as file:
        file.writelines(lines)

def main():
    """Crawl ``pages`` result pages sequentially and append them to a CSV file.

    Builds the request headers and form payload, writes the CSV header row to
    ``path``, then for each page index fetches the hotels with ``get_detail``
    and appends them with ``save``.

    The cookie, tokens and output path below are captured, machine-specific
    values; the accompanying article says they must be edited before use.
    """
    header = {
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9,ja;q=0.8,en;q=0.7',
        'Connection': 'keep-alive',
        'Content-Length': '2273',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'Cookie': 'CookieGuid=2cbbf4d7-ed57-43f8-8842-4a74ad939a46; H5CookieId=db6aea90-999f-4be0-9930-7cfcea066214; _fid=2cbbf4d7-ed57-43f8-8842-4a74ad939a46; CitySearchHistory=0101%23%E5%8C%97%E4%BA%AC%23beijing%23; firsttime=1583163644996; SessionGuid=abe14edf-638e-44e5-a26d-94807cbf9e7a; Esid=038b4b4a-fd3a-4c88-bb54-fe60f6a9bf12; com.eLong.CommonService.OrderFromCookieInfo=Orderfromtype=1&Parentid=1000&Status=1&Cookiesdays=30&Coefficient=0.0&Pkid=1105&Priority=9001&Isusefparam=0&Makecomefrom=0&Savecookies=0; fv=pcweb; anti_token=4D71FDF2-2357-461A-984C-56958AE1A7CE; ShHotel=InDate=2020-03-03&CityID=0101&CityNameEN=beijing&CityNameCN=%E5%8C%97%E4%BA%AC&OutDate=2020-03-04&CityName=%E5%8C%97%E4%BA%AC; ext_param=bns%3D4%26ct%3D3; s_cc=true; __tctmc=0.215881358; __tctmc=20377580.26050747; __tctmd=20377580.254392154; __tctma=20377580.1583163637156064.1583163637230.1583163637230.1583200239719.2; __tctmu=20377580.0.0; __tctmz=20377580.1583200239719.2.1.utmccn=(referral)|utmcsr=bing.com|utmcct=|utmcmd=referral; longKey=1583163637156064; __tctrack=0; __tctmd=0.1; lasttime=1583202216497; s_visit=1; User-Ref-SessionId=78bd-e80d-13ad-2b0a-92aa-532a; trace_extend={"deviceid":"2cbbf4d7-ed57-43f8-8842-4a74ad939a46","appid":"6","userid":"2cbbf4d7-ed57-43f8-8842-4a74ad939a46","orderfromid":1105,"sessionid":"78bd-e80d-13ad-2b0a-92aa-532a","pvid":"c1b8aaeb"}; __tctmb=0.1284443720926117.1583203761350.1583203761350.1; s_sq=elongcom%3D%2526pid%253Dhotel.elong.com%25252Fbeijing%2526pidt%253D1%2526oid%253Djavascript%25253Avoid(0)%2526ot%253DA; __tccgd=0.0; JSESSIONID=1F53BCB446035ADE53A945505FBF2D47',
        'Host': 'hotel.elong.com',
        'Origin': 'http://hotel.elong.com',
        'Referer': 'http://hotel.elong.com/beijing/',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36',
        'X-Requested-With': 'XMLHttpRequest',
    }
    form_data = {
        'code': '7611836',
        'listRequest.areaID': '',
        'listRequest.bedLargeTypes': '',
        'listRequest.bookingChannel': '1',
        'listRequest.breakfasts': '0',
        'listRequest.cancelFree': 'false',
        'listRequest.cardNo': '192928',
        'listRequest.checkInDate': '2020-03-03 00:00:00',
        'listRequest.checkOutDate': '2020-03-04 00:00:00',
        'listRequest.cityID': '0101',
        'listRequest.cityName': '北京',
        'listRequest.crawledFlag': '0',
        'listRequest.customLevel': '11',
        'listRequest.discountIds': '',
        'listRequest.distance': '20000',
        'listRequest.endLat': '0',
        'listRequest.endLng': '0',
        'listRequest.epcCreateOrderGuideVersion': 'C',
        'listRequest.facilityIds': '',
        'listRequest.guokaoFlag': 'false',
        'listRequest.highPrice': '0',
        'listRequest.hotelBrandIDs': '',
        'listRequest.hotelIDs': '',
        'listRequest.interceptAction': '0',
        'listRequest.isAdvanceSave': 'false',
        'listRequest.isAfterCouponPrice': 'true',
        'listRequest.isCoupon': 'false',
        'listRequest.isDebug': 'false',
        'listRequest.isLimitTime': 'false',
        'listRequest.isLogin': 'false',
        'listRequest.isMobileOnly': 'true',
        'listRequest.isNeed5Discount': 'true',
        'listRequest.isNeedNotContractedHotel': 'false',
        'listRequest.isNeedSimilarPrice': 'false',
        'listRequest.isReturnNoRoomHotel': 'true',
        'listRequest.isStaySave': 'false',
        'listRequest.isTrace': 'false',
        'listRequest.isUnionSite': 'false',
        'listRequest.isnstantConfirm': 'false',
        'listRequest.keywords': '',
        'listRequest.keywordsType': '0',
        'listRequest.language': 'cn',
        'listRequest.lat': '39.9059093',
        'listRequest.listType': '0',
        'listRequest.lng': '116.3913489',
        'listRequest.lowPrice': '0',
        'listRequest.orderFromID': '1105',
        'listRequest.pageIndex': '1',
        'listRequest.pageSize': '20',
        'listRequest.payMethod': '0',
        'listRequest.personOfRoom': '0',
        'listRequest.poiId': '0',
        'listRequest.poiName': '',
        'listRequest.productTypes': '1,6,26',
        'listRequest.promotionChannelCode': '0000',
        'listRequest.promotionSwitch': '-1',
        'listRequest.proxyID': 'ZD',
        'listRequest.rankType': '0',
        'listRequest.returnFilterItem': 'true',
        'listRequest.sectionId': '',
        'listRequest.sellChannel': '1',
        'listRequest.seoHotelStar': '0',
        'listRequest.sortDirection': '1',
        'listRequest.sortMethod': '1',
        'listRequest.standBack': '-1',
        'listRequest.starLevels': '',
        'listRequest.startLat': '0',
        'listRequest.startLng': '0',
        'listRequest.sug_act_info': '',
        'listRequest.taRecommend': 'false',
        'listRequest.themeIds': '',
        'listRequest.traceId': 'b19bcbae-5495-4ce0-ad0e-3db4778d75e7',
        'listRequest.wordId': '',
        'listRequest.wordType': '-1',
        'listRequest.elongToken': '2cbbf4d7-ed57-43f8-8842-4a74ad939a46',
        'listRequest.trace_token': '|*|cityId:101|*|qId:1b8e11de-115a-4191-a0ce-b9e679ae66cb|*|st:city|*|sId:101|*|',
    }

    # Crawl depth: number of result pages to request.
    pages = 10
    url = 'http://hotel.elong.com/ajax/tmapilist/asyncsearch'
    path = r"C:\Users\asus\Desktop\CS\PYTHON\实例\酒店信息.csv"

    # Write the CSV header row; the context manager closes the handle even
    # if the write fails.
    with open(path, "a") as file:
        file.write("名称,价格,评分\n")

    # NOTE(review): range(pages) sends pageIndex 0..pages-1, while the
    # captured form above carries '1' -- confirm whether the endpoint is
    # 0-based or 1-based.
    for page in range(pages):
        # Only the page index changes between requests.
        form_data['listRequest.pageIndex'] = str(page)
        detail_list = get_detail(url, header, form_data)
        save(detail_list, path)

if __name__ == "__main__":
    # Time the whole crawl with wall-clock timestamps.
    started_at = time.time()
    main()
    print("Execute over")
    elapsed_seconds = time.time() - started_at
    print(elapsed_seconds)
    # The author measured 33.72 s for this single-threaded run.

单线程代码至此结束。

多线程完整代码(同样需要修改 path 和请求头 header 后可用):

import re

import requests

import json

import time

import threading

def get_detail(url, header, form_data, path):
    """Fetch one page of the hotel list and append its rows to ``path``.

    POSTs ``form_data`` to ``url`` with the request headers ``header``,
    parses the reply as JSON, reads the HTML fragment stored under
    ``['value']['hotelListHtml']``, extracts name / price / score for each
    hotel chunk and hands the resulting list to ``save``.

    Returns:
        None. The extracted rows are written by ``save``.

    Raises:
        Whatever ``requests.post`` / ``json.loads`` / ``save`` raise, and
        ``KeyError`` if the reply lacks ``value`` or ``hotelListHtml``.
    """
    response = requests.post(url, headers=header, data=form_data)
    response.encoding = response.apparent_encoding

    # Convert the JSON reply into a dict and pull out the HTML fragment.
    dictionary = json.loads(response.text)
    text = dictionary['value']['hotelListHtml']

    # Split the fragment into one chunk per hotel.
    # NOTE(review): in the published listing this literal was broken across
    # lines (a syntax error); it is restored here as its visible content.
    # The real delimiter was presumably an HTML tag that got stripped when the
    # article was rendered -- confirm against the live markup.
    hotel_list = text.split('\n\n')

    # NOTE(review): the name and price patterns end in a lazy group with
    # nothing after it, so as written they can only capture an empty string.
    # Their closing markup looks stripped as well -- restore before relying
    # on these two fields.
    dict_list = []
    for each_hotel in hotel_list:
        try:
            name = re.findall(r'class="info_cn">(.*?)', each_hotel)[0]
            price = re.findall(r'(.*?)', each_hotel)[0]
            score = re.findall(r'data-score="(.*?)"', each_hotel)[0]
        except IndexError:
            # A chunk without all three fields is not a hotel entry; skip it.
            continue
        dict_list.append({"name": name, "price": price, "score": score})

    # Hand-off point: persist this page's rows.
    save(dict_list, path)

# Serializes appends: save() is reached from get_detail, which main() runs as
# the target of several threads that all write to the same file.
_SAVE_LOCK = threading.Lock()


def save(detail_list, path):
    """Append one ``name,price,score`` line per hotel to the file at ``path``.

    Args:
        detail_list: iterable of dicts with the string values ``"name"``,
            ``"price"`` and ``"score"``.
        path: file to append to; it is created if missing, even when
            ``detail_list`` is empty.

    Raises:
        KeyError: if an entry lacks one of the three keys.
        TypeError: if a value is not a string.

    Fields are written verbatim, without CSV quoting, so a comma inside a
    value would shift the columns of that row.
    """
    # Build the text before taking the lock so the lock is held only for I/O.
    lines = [
        ",".join((each_detail["name"], each_detail["price"], each_detail["score"])) + "\n"
        for each_detail in detail_list
    ]
    # One writer at a time keeps rows from different pages from interleaving;
    # the context manager closes the handle even if the write fails.
    with _SAVE_LOCK:
        with open(path, "a") as file:
            file.writelines(lines)

def main():
    """Crawl ``pages`` result pages concurrently, one thread per page.

    Builds the request headers and form payload, writes the CSV header row to
    ``path``, starts one ``get_detail`` thread per page index and waits for
    all of them to finish.

    The cookie, tokens and output path below are captured, machine-specific
    values; the accompanying article says they must be edited before use.
    """
    header = {
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9,ja;q=0.8,en;q=0.7',
        'Connection': 'keep-alive',
        'Content-Length': '2273',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'Cookie': 'CookieGuid=2cbbf4d7-ed57-43f8-8842-4a74ad939a46; H5CookieId=db6aea90-999f-4be0-9930-7cfcea066214; _fid=2cbbf4d7-ed57-43f8-8842-4a74ad939a46; CitySearchHistory=0101%23%E5%8C%97%E4%BA%AC%23beijing%23; firsttime=1583163644996; SessionGuid=abe14edf-638e-44e5-a26d-94807cbf9e7a; Esid=038b4b4a-fd3a-4c88-bb54-fe60f6a9bf12; com.eLong.CommonService.OrderFromCookieInfo=Orderfromtype=1&Parentid=1000&Status=1&Cookiesdays=30&Coefficient=0.0&Pkid=1105&Priority=9001&Isusefparam=0&Makecomefrom=0&Savecookies=0; fv=pcweb; anti_token=4D71FDF2-2357-461A-984C-56958AE1A7CE; ShHotel=InDate=2020-03-03&CityID=0101&CityNameEN=beijing&CityNameCN=%E5%8C%97%E4%BA%AC&OutDate=2020-03-04&CityName=%E5%8C%97%E4%BA%AC; ext_param=bns%3D4%26ct%3D3; s_cc=true; __tctmc=0.215881358; __tctmc=20377580.26050747; __tctmd=20377580.254392154; __tctma=20377580.1583163637156064.1583163637230.1583163637230.1583200239719.2; __tctmu=20377580.0.0; __tctmz=20377580.1583200239719.2.1.utmccn=(referral)|utmcsr=bing.com|utmcct=|utmcmd=referral; longKey=1583163637156064; __tctrack=0; __tctmd=0.1; lasttime=1583202216497; s_visit=1; User-Ref-SessionId=78bd-e80d-13ad-2b0a-92aa-532a; trace_extend={"deviceid":"2cbbf4d7-ed57-43f8-8842-4a74ad939a46","appid":"6","userid":"2cbbf4d7-ed57-43f8-8842-4a74ad939a46","orderfromid":1105,"sessionid":"78bd-e80d-13ad-2b0a-92aa-532a","pvid":"c1b8aaeb"}; __tctmb=0.1284443720926117.1583203761350.1583203761350.1; s_sq=elongcom%3D%2526pid%253Dhotel.elong.com%25252Fbeijing%2526pidt%253D1%2526oid%253Djavascript%25253Avoid(0)%2526ot%253DA; __tccgd=0.0; JSESSIONID=1F53BCB446035ADE53A945505FBF2D47',
        'Host': 'hotel.elong.com',
        'Origin': 'http://hotel.elong.com',
        'Referer': 'http://hotel.elong.com/beijing/',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36',
        'X-Requested-With': 'XMLHttpRequest',
    }
    form_data = {
        'code': '7611836',
        'listRequest.areaID': '',
        'listRequest.bedLargeTypes': '',
        'listRequest.bookingChannel': '1',
        'listRequest.breakfasts': '0',
        'listRequest.cancelFree': 'false',
        'listRequest.cardNo': '192928',
        'listRequest.checkInDate': '2020-03-03 00:00:00',
        'listRequest.checkOutDate': '2020-03-04 00:00:00',
        'listRequest.cityID': '0101',
        'listRequest.cityName': '北京',
        'listRequest.crawledFlag': '0',
        'listRequest.customLevel': '11',
        'listRequest.discountIds': '',
        'listRequest.distance': '20000',
        'listRequest.endLat': '0',
        'listRequest.endLng': '0',
        'listRequest.epcCreateOrderGuideVersion': 'C',
        'listRequest.facilityIds': '',
        'listRequest.guokaoFlag': 'false',
        'listRequest.highPrice': '0',
        'listRequest.hotelBrandIDs': '',
        'listRequest.hotelIDs': '',
        'listRequest.interceptAction': '0',
        'listRequest.isAdvanceSave': 'false',
        'listRequest.isAfterCouponPrice': 'true',
        'listRequest.isCoupon': 'false',
        'listRequest.isDebug': 'false',
        'listRequest.isLimitTime': 'false',
        'listRequest.isLogin': 'false',
        'listRequest.isMobileOnly': 'true',
        'listRequest.isNeed5Discount': 'true',
        'listRequest.isNeedNotContractedHotel': 'false',
        'listRequest.isNeedSimilarPrice': 'false',
        'listRequest.isReturnNoRoomHotel': 'true',
        'listRequest.isStaySave': 'false',
        'listRequest.isTrace': 'false',
        'listRequest.isUnionSite': 'false',
        'listRequest.isnstantConfirm': 'false',
        'listRequest.keywords': '',
        'listRequest.keywordsType': '0',
        'listRequest.language': 'cn',
        'listRequest.lat': '39.9059093',
        'listRequest.listType': '0',
        'listRequest.lng': '116.3913489',
        'listRequest.lowPrice': '0',
        'listRequest.orderFromID': '1105',
        'listRequest.pageIndex': '1',
        'listRequest.pageSize': '20',
        'listRequest.payMethod': '0',
        'listRequest.personOfRoom': '0',
        'listRequest.poiId': '0',
        'listRequest.poiName': '',
        'listRequest.productTypes': '1,6,26',
        'listRequest.promotionChannelCode': '0000',
        'listRequest.promotionSwitch': '-1',
        'listRequest.proxyID': 'ZD',
        'listRequest.rankType': '0',
        'listRequest.returnFilterItem': 'true',
        'listRequest.sectionId': '',
        'listRequest.sellChannel': '1',
        'listRequest.seoHotelStar': '0',
        'listRequest.sortDirection': '1',
        'listRequest.sortMethod': '1',
        'listRequest.standBack': '-1',
        'listRequest.starLevels': '',
        'listRequest.startLat': '0',
        'listRequest.startLng': '0',
        'listRequest.sug_act_info': '',
        'listRequest.taRecommend': 'false',
        'listRequest.themeIds': '',
        'listRequest.traceId': 'b19bcbae-5495-4ce0-ad0e-3db4778d75e7',
        'listRequest.wordId': '',
        'listRequest.wordType': '-1',
        'listRequest.elongToken': '2cbbf4d7-ed57-43f8-8842-4a74ad939a46',
        'listRequest.trace_token': '|*|cityId:101|*|qId:1b8e11de-115a-4191-a0ce-b9e679ae66cb|*|st:city|*|sId:101|*|',
    }

    # Crawl depth: number of result pages to request.
    pages = 10
    url = 'http://hotel.elong.com/ajax/tmapilist/asyncsearch'
    path = r"C:\Users\asus\Desktop\酒店信息.csv"

    # Write the CSV header row before any worker starts appending; the
    # context manager closes the handle even if the write fails.
    with open(path, "a") as file:
        file.write("名称,价格,评分\n")

    # NOTE(review): range(pages) sends pageIndex 0..pages-1, while the
    # captured form above carries '1' -- confirm whether the endpoint is
    # 0-based or 1-based.
    threads = []
    for page in range(pages):
        # Give every thread its own copy of the form. Sharing one dict and
        # mutating it here would let a thread that has not yet sent its
        # request pick up a later iteration's page index.
        page_form = dict(form_data)
        page_form['listRequest.pageIndex'] = str(page)
        thread = threading.Thread(
            target=get_detail,
            args=(url, header, page_form, path),
            name=str(page),
        )
        thread.start()
        threads.append(thread)

    # Wait for every page to be fetched and saved before returning.
    for each_thread in threads:
        each_thread.join()

if __name__ == "__main__":
    # Time the whole crawl with wall-clock timestamps.
    started_at = time.time()
    main()
    print("Execute over")
    elapsed_seconds = time.time() - started_at
    print(elapsed_seconds)
    # The author measured 1.78 s for this multi-threaded run.

代码至此结束

此栏目旨在共享自学之乐,共勉求知之友,共塑网站和谐好学的形象。

欢迎大家在评论区发表合理的意见和指正。

如果觉得该栏目对您有帮助,望不吝点赞收藏。


http://www.ppmy.cn/news/819452.html

相关文章

两名一流高校硕士同年毕业论文高度雷同惹争议,怎么过的查重?

点击上方“3D视觉工坊”,选择“星标” 干货第一时间送达 导读: 近日,又现多起研究生论文涉嫌抄袭事件。相关高校也都对此迅速做出了回应,这也是继八月份教育部发布文件要求各高校要立即采取行动,全面复核、排查近5年…

物联大世界 2019年国际物联网展会5月在北京盛大召开

时间2019年05月16-18日 地点 中国•北京亦创国际会展中心4组织单位 特邀单位: 商务部批准单位:北京市商务委员会主办单位:中国电子商会物联网技术产品应用专业委员会北京铭世博国际展览有限公司 支持单位: 中国智能家居产业联盟 中…

2019亚洲北京新零售展览会-展会新闻资讯

时间:2019年6月28-30日 地点:中国北京亦创国际会展中心邀 请 函主办单位: 中国商业联合会智慧商业分会北京嵌入式系统技术行业协会北京电子电器协会北京电子学会协办单位: 北京市协会北京市商业联合会北京市连锁协会北京市海淀区商业联合会天津百货商业协…

中文街景店铺检测和识别数据集

中文 Photo OCR 街景数据集ICDAR-ReCTS中文街景数据集CTWShopSign(中英文) Baseline-scene text detectionCTPNTextBoxesEASTConceptual Text Region Network: Cognition-Inspired Accurate Scene Text Detection Baseline-scene text recognitionCRNNSl…

爬虫 | 打印page_source+正则匹配

美团图片爬取 1 背景2 数据准备2.1 读入数据2.2 查看美团商户网址前五个看看 3 汇总3.1 定义正则匹配网址函数3.1.1 匹配大大图3.1.2 匹配大图3.1.3 匹配推荐菜 3.2 测试上述函数3.2.1 大大图3.2.2 大图3.2.3 推荐菜 3.3 整体进行爬取 4 找到空文件夹 然后把名称记录下来 然后拼…

2019北京物联网智慧城市大数据博览会开启中国之路

邀 请 函 时间2019年05月16-18日 地点 中国•北京亦创国际会展中心4组织单位 特邀单位: 商务部批准单位:北京市商务委员会主办单位:中国电子商会物联网技术产品应用专业委员会北京铭世博国际展览有限公司 支持单位: 中国智能家居产…

“3D打印”的魔法时代还有多远?

2009年,24岁的浙江金华人金涛,从浙江大学计算机专业研究生毕业后,在香港做了不到一年博士,就决定回内地创业。启发他创业灵感的,是国外一家电子礼品店的个性3D打印服务。 当时,面向普通人的3D打印服务在国内…

极兔快递电子面单打印API接口-极兔快递

前言 J&T 极兔速递是一家科技创新型互联网快递物流企业,致力于为用户带来优质的快递和物流体验。 2015年8月由印尼首都雅加达作为起点,进入快递物流市场,目前覆盖了印度尼西亚、越南、马来西亚、泰国、菲律宾、柬埔寨及新加坡七个国家,成为东南亚超过5.5亿人口信赖的…