待爬取网页:
代码:
import pandas as pd
import requests
import xlwt  # NOTE(review): not referenced in the visible part of the script
from lxml import etree
from lxml import html

# Listing page that the script downloads and parses.
url = "https://www.haohua.com/xianhua/"

# HTTP request headers sent with the page download.
#
# Two entries from the originally pasted header set are intentionally omitted:
#   * "cookie": the value looked like it was captured from a cn.bing.com
#     request (it included a USRLOC latitude/longitude entry); it has no
#     purpose on this site and should not be sent to a third party.
#   * "accept-encoding": it advertised "br" and "zstd", which requests can
#     only decode when optional packages are installed. Leaving it out lets
#     requests advertise exactly the encodings it is able to decode.
#
# NOTE(review): "accept" and "sec-fetch-dest" describe an image request, so
# this set was presumably copied from an image fetch in the browser dev tools.
# They are kept unchanged; confirm whether the site cares.
header = {
    "accept": "image/avif,image/webp,image/apng,image/svg+xml,image/*,*/*;q=0.8",
    "accept-language": "zh-CN,zh;q=0.9",
    "ect": "4g",
    "priority": "i",
    "referer": "https://cn.bing.com/chrome/newtab",
    "sec-ch-ua": '"Not A(Brand";v="8", "Chromium";v="132", "Google Chrome";v="132"',
    "sec-ch-ua-arch": "x86",
    "sec-ch-ua-bitness": "64",
    "sec-ch-ua-full-version": "132.0.6834.111",
    "sec-ch-ua-full-version-list": (
        '"Not A(Brand";v="8.0.0.0", "Chromium";v="132.0.6834.111", '
        '"Google Chrome";v="132.0.6834.111"'
    ),
    "sec-ch-ua-mobile": "?0",
    "sec-ch-ua-model": "",
    "sec-ch-ua-platform": "Windows",
    "sec-ch-ua-platform-version": "15.0.0",
    "sec-fetch-dest": "image",
    "sec-fetch-mode": "no-cors",
    "sec-fetch-site": "same-origin",
    "user-agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/132.0.0.0 Safari/537.36"
    ),
}
def _child_text(node, index):
    """Return the text of the ``index``-th child of ``node``.

    Returns ``None`` when ``node`` has no child at that position, so one
    malformed entry yields an empty cell instead of aborting the whole run.
    """
    try:
        return node[index].text
    except IndexError:
        return None


def collect_columns(name_nodes, price_nodes):
    """Build the spreadsheet columns from the parsed elements.

    Args:
        name_nodes: ``<h5>`` elements; the text of each becomes one
            ``fresh_flowers`` value.
        price_nodes: ``<p>`` elements; the text of child 1 becomes
            ``original_price`` and the text of child 2 becomes ``popularity``.

    Returns:
        A dict mapping each column name to its list of values, in input order.

    Raises:
        ValueError: If the two inputs differ in length, because the columns
            could then no longer be lined up row by row.
    """
    if len(name_nodes) != len(price_nodes):
        raise ValueError(
            f"found {len(name_nodes)} name elements but "
            f"{len(price_nodes)} price elements; cannot align rows"
        )
    return {
        "fresh_flowers": [node.text for node in name_nodes],
        "original_price": [_child_text(node, 1) for node in price_nodes],
        "popularity": [_child_text(node, 2) for node in price_nodes],
    }


def main():
    """Download the listing page, extract the columns and write the Excel file.

    Uses the module-level ``url`` and ``header`` for the request.

    Raises:
        requests.RequestException: If the download fails, times out, or the
            server answers with an HTTP error status.
        ValueError: If the name and price elements cannot be aligned.
    """
    # A timeout keeps the script from hanging forever on a stalled connection.
    response = requests.get(url=url, headers=header, timeout=10)
    # Fail loudly on 4xx/5xx instead of parsing an error page.
    response.raise_for_status()
    response.encoding = "utf-8"

    # Named ``tree`` rather than ``html`` so a module named ``html`` imported
    # at the top of the file is not shadowed.
    tree = etree.HTML(response.text)
    name_nodes = tree.xpath('//a[@class = "info imghover"]/h5')
    price_nodes = tree.xpath('//a[@class = "info imghover"]/p')

    # Organise the data as a dict of columns and build the DataFrame from it.
    data = collect_columns(name_nodes, price_nodes)
    df = pd.DataFrame(data)

    # Write the DataFrame to an Excel file.
    df.to_excel("xianhua_data.xlsx", index=False)
    print("数据已成功写入Excel文件")


if __name__ == "__main__":
    main()
结果文件: