What is XPath
XPath is a language for searching for content in XML documents.
HTML is structured closely enough to XML (XHTML is an actual XML application) that XPath works on HTML too.
Implementation
It mainly relies on the etree module of the lxml package.
Demo code
Usage 1: XML
from lxml import etree

xml = """
<book>
    <id>1</id>
    <name>
        <nick id="10086">2112</nick>
        <nick id="10010">4555</nick>
        <nick class="joy">fgbgn</nick>
        <nick class="jolin">goog</nick>
        <dir>
            <nick class="ksj">gogofff</nick>
            <dir>
                <nick class="ksj">ggogo</nick>
            </dir>
        </dir>
    </name>
</book>
"""
tree = etree.XML(xml)
# rer = tree.xpath("/book")                          # / separates levels of the hierarchy
# rerr = tree.xpath("/book/id")                      # / separates levels
# result = tree.xpath("/book/id/text()")             # text() fetches the text of the id node
# result = tree.xpath("/book/name/nick/text()")      # text of every nick directly under name
# result = tree.xpath("/book/name/dir/nick/text()")  # text of the nick nodes inside dir
# result = tree.xpath("/book/name//nick/text()")     # // means descendants: every nick anywhere under name
result = tree.xpath("/book/name/*/nick/text()")      # * matches any node, but only one level deep
print(result)
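Square-bracket predicates are the other half of everyday XPath; a minimal sketch continuing the script above (the output comments are what lxml returns for this document):

# filter by attribute value
print(tree.xpath('//nick[@id="10086"]/text()'))             # ['2112']
# filter by position; XPath counts from 1, not 0
print(tree.xpath('/book/name/nick[1]/text()'))              # ['2112']
# substring match on an attribute
print(tree.xpath('//nick[contains(@class, "jo")]/text()'))  # ['fgbgn', 'goog']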
Usage 2: parse
from lxml import etree

tree = etree.parse("b.html")  # the default parser expects well-formed XML; for messy real-world HTML, pass etree.HTMLParser() as the second argument
result = tree.xpath("/html/a[@href='dabao']/text()")  # text of the <a> under html whose href is 'dabao'
result = tree.xpath("/html/a/@href")                  # the href attribute values themselves, e.g. 'dabao'
ol_li_list = tree.xpath("/html/a")
for li in ol_li_list:
    result = li.xpath("./o/text()")   # ./ is a relative query: text under html/a/o
    result2 = li.xpath("./o/@href")   # href attributes under html/a/o
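The snippet above assumes a b.html already on disk. To run it end to end, you could first write a hypothetical b.html (made up here, not from the original post) that matches those queries:

from lxml import etree

# a made-up, well-formed b.html so the default XML parser accepts it
content = """<html>
    <a href="dabao">first<o href="x1">one</o></a>
    <a href="feizhu">second<o href="x2">two</o></a>
</html>"""
with open("b.html", "w", encoding="utf-8") as f:
    f.write(content)

tree = etree.parse("b.html")
print(tree.xpath("/html/a[@href='dabao']/text()"))  # ['first']
print(tree.xpath("/html/a/@href"))                  # ['dabao', 'feizhu']
for a in tree.xpath("/html/a"):
    print(a.xpath("./o/text()"), a.xpath("./o/@href"))  # relative lookups with ./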
Usage 3: HTML
Example: the 'newstart' challenge.
import requests
from lxml import etree

url = 'http://eci-2zedc18yc0kv02b4wfp5.cloudeci1.ichunqiu.com/start'
url1 = 'http://eci-2zedc18yc0kv02b4wfp5.cloudeci1.ichunqiu.com/submit'
s = requests.session()             # a session keeps cookies between the GET and the POST
a = s.get(url).content
tree = etree.HTML(a)
titles = tree.xpath('//p/text()')  # grab the text of every <p> node on the page
result = "".join(titles)
data = {"user_input": result}      # was {result}, a set; the form field needs a plain string
d = s.post(url1, data=data)
print(d.text)
Just a few lines and it's done. It's quicker than re or bs4, though perhaps not as flexible as re.
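To make that comparison concrete, here is the same extraction done both ways on a made-up snippet (illustrative only):

import re
from lxml import etree

html = "<div><p>hello</p><p>world</p></div>"

# re: pattern-matching against the raw text
print(re.findall(r"<p>(.*?)</p>", html))  # ['hello', 'world']

# xpath: one expression against the parsed tree
tree = etree.HTML(html)
print(tree.xpath("//p/text()"))           # ['hello', 'world']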
Practice: scraping blog info
I originally meant to scrape Zhubajie, but it has since added a second round of verification, which makes it hard to scrape.
So I'm leeching off a blog instead (permission obtained; don't imitate this without asking).
import requests
from lxml import etree

url = 'https://noobxiaomeng.top/'
rspons = requests.get(url)
rel = rspons.content
tree = etree.HTML(rel)
rell = tree.xpath("//div/h6/text()")  # pull the personal-info text
for i in rell:
    print(i)
relll = tree.xpath("//header/a[@class='post-title']/text()")   # scrape the post titles
rellll = tree.xpath("//header/a[@class='post-title']/@href")   # scrape the post links
for i, j in zip(relll, rellll):
    print(i + ":" + j + '\n')
Comparison: scraping an image gallery with bs4 vs xpath
bs4:
import requests
from bs4 import BeautifulSoup

url = "https://www.umei.cc/bizhitupian/weimeibizhi/"
resp = requests.get(url=url)
resp.encoding = 'utf-8'
re = BeautifulSoup(resp.text, "html.parser")  # note: this name shadows the stdlib re module
ac = re.find("div", class_="Clbc_r_cont").find_all("a")
for a in ac:
    href = a.get("href")
    print(href)
    url2 = "https://www.umei.cc/" + href
    page_re = requests.get(url2)
    page_re.encoding = 'utf-8'
    page_text = page_re.text
    page = BeautifulSoup(page_text, "html.parser")
    p = page.find("div", class_="big-pic")
    img = p.find("img")
    src = img.get("src")
    # download the image
    img_re = requests.get(src)
    img_name = src.split("/")[-1]
    with open(img_name, mode="wb") as f:
        f.write(img_re.content)  # save the image
    print("ob")
print("over")
xpath:
import requests
from lxml import etree

# Page 1 has no index suffix (index_1 does not exist), so numbered pages start at index_2.
# You could walk more listing pages the same way; I did not bother here.
url = 'https://www.umei.cc/katongdongman/dongmanbizhi/index_2.htm'
re1 = requests.get(url).content
tree1 = etree.HTML(re1)
tll = tree1.xpath('//div[@class="btns"]/a/@href')  # collect the detail-page links
for i in tll:
    url2 = "https://www.umei.cc" + i
    re2 = requests.get(url2).content
    tree2 = etree.HTML(re2)
    tlll = tree2.xpath('//div[@class="big-pic"]//img/@title')  # match the image title
    tllll = tree2.xpath('//div[@class="big-pic"]//img/@src')   # match the image URL
    img_name = "".join(tlll) + '.jpg'
    url3 = "".join(tllll)
    img_re = requests.get(url=url3)
    with open(img_name, mode="wb") as f:
        f.write(img_re.content)  # save the image
    print("ob")
print('over')
You can see this is simpler than the bs4 version.
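The comment at the top of the xpath script mentions walking multiple listing pages but leaves it undone; a hypothetical sketch of that loop, assuming the index_N.htm numbering described in the comment holds:

import requests
from lxml import etree

base = "https://www.umei.cc/katongdongman/dongmanbizhi/"
for n in range(2, 5):  # pages 2..4; page 1 has no index suffix
    page = etree.HTML(requests.get(base + f"index_{n}.htm").content)
    links = page.xpath('//div[@class="btns"]/a/@href')
    print(f"page {n}: {len(links)} detail links")  # feed each link to the download loop above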