This scrape relies on three libraries commonly used for crawling: requests, BeautifulSoup, and re. requests issues the HTTP request for the page, BeautifulSoup is the standard library for parsing the returned HTML, and re is Python's built-in regular expression module.
import requests
from bs4 import BeautifulSoup
from requests.exceptions import RequestException
import re

# Target URL and request headers
url = 'https://news.ifeng.com/c/7u3cafzm3Ki'
headers = {
    'Accept': '*/*',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Connection': 'keep-alive',
    'Host': 'news.ifeng.com',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 UBrowser/6.2.3964.2 Safari/537.36',
}
# Fetch the target page
def pageget(url):
    try:
        responds = requests.get(url, headers=headers)
        if responds.status_code == 200:  # check that the request succeeded
            return responds.text
        return None
    except RequestException:
        return None

passage = pageget(url)
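pageget returns None on any failure, so it is worth a guard before handing the result to BeautifulSoup. A minimal sketch (the exit message is my own wording, not part of the original code):

# Bail out early if the fetch failed; otherwise BeautifulSoup
# would be handed None and raise a TypeError.
if passage is None:
    raise SystemExit('Failed to fetch ' + url)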
### Use BeautifulSoup to extract the article
soup = BeautifulSoup(passage, 'html.parser')  # name a parser explicitly so bs4 does not guess and warn
## Extract the article title
souptext = soup.select('h1[class="topic-3bY8Hw-9"]')
# Convert the first match in the result list to a string
souptext2 = str(souptext[0])
# Use re.sub to strip the opening and closing <h1> tags
souptext3 = re.sub('<h1 class="topic-3bY8Hw-9">', '', souptext2)
title = re.sub('</h1>', '', souptext3)
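The same title can be had without regular expressions: BeautifulSoup tags have a get_text() method that returns only the text content. A minimal alternative sketch, assuming souptext is non-empty (title_alt is a name introduced here for illustration):

# Equivalent extraction without re: get_text() drops the markup,
# and strip=True trims surrounding whitespace.
title_alt = souptext[0].get_text(strip=True)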
## Extract the article body
zhuti = str(soup.select('div[class="text-3zQ3cZD4"]')[0])
# Use re.sub to clean the markup: replace each closing tag (e.g. </p>, </div>) with a newline \n, then each remaining tag (e.g. <p>, <div>) with a space
zhuti2 = re.sub(r'</.*?>', '\n', zhuti)
zhuti3 = re.sub(r'<.*?>', ' ', zhuti2)
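As with the title, the regex cleanup can be replaced by get_text(), which accepts a separator to insert between text fragments and a strip flag. A minimal alternative sketch (zhuti_alt is a name introduced here for illustration):

# One-step alternative: join the tag's text fragments with newlines
# instead of stripping the markup with regular expressions.
zhuti_alt = soup.select('div[class="text-3zQ3cZD4"]')[0].get_text('\n', strip=True)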