1. Reading a web page (two ways: line by line, or all at once)
import urllib.request  # HTTP requests

# Read the entire page source at once
#mystr = urllib.request.urlopen("http://www.baidu.com").read()
#print(mystr.decode("utf-8"))

# Read line by line
for line in urllib.request.urlopen("http://www.baidu.com"):
    print(line.decode("utf-8"))
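Some servers reject urllib's default User-Agent, so the reads above can fail with an HTTP error. A minimal sketch of the same one-shot read with a browser-like header and a timeout; the header value is an illustrative assumption, not part of the original code:

import urllib.request

req = urllib.request.Request(
    "http://www.baidu.com",
    headers={"User-Agent": "Mozilla/5.0"},  # assumed browser-like agent
)
with urllib.request.urlopen(req, timeout=10) as resp:  # timeout avoids hanging forever
    print(resp.read().decode("utf-8"))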
2. Scraping email addresses
import urllib.request
import re

mailregex = re.compile(r"([A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4})", re.IGNORECASE)
'''
# Read line by line
for line in urllib.request.urlopen("http://bbs.tianya.cn/post-140-393974-1.shtml"):
    mylist = mailregex.findall(line.decode("utf-8"))
    if mylist:  # print only when matches are found
        print(mylist)
'''
# Read the entire page source at once
mystr = urllib.request.urlopen("http://bbs.tianya.cn/post-140-393974-1.shtml").read()
mylist = mailregex.findall(mystr.decode("utf-8"))
print(mylist)
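A quick offline check of the pattern on a made-up string (the sample text is illustrative) shows what findall returns:

import re

mailregex = re.compile(r"([A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4})", re.IGNORECASE)
sample = 'contact foo.bar@example.com or <a href="mailto:test_1@mail.example.org">'
print(mailregex.findall(sample))
# ['foo.bar@example.com', 'test_1@mail.example.org']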
3. Scraping QQ numbers
import urllib.request
import re

QQregex = re.compile(r"[1-9]\d{4,10}", re.IGNORECASE)

# Read line by line
for line in urllib.request.urlopen("http://bbs.tianya.cn/post-140-393974-1.shtml"):
    line = line.decode("utf-8")
    if line.find("QQ") != -1 or line.find("Qq") != -1 or line.find("qq") != -1:
        mylist = QQregex.findall(line)
        if mylist:  # print only when matches are found
            print(mylist)
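The three find() calls only approximate case-insensitive matching (they miss "qQ"); a small sketch of an equivalent single test, on a made-up line:

import re

QQregex = re.compile(r"[1-9]\d{4,10}")
line = "Contact me, qq 12345678"
if "qq" in line.lower():          # covers QQ, Qq, qQ and qq in one test
    print(QQregex.findall(line))  # ['12345678']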
4. Scraping links
import urllib.request
import re
# (http://\S*?)["]  capture up to the terminating quote, quote excluded
# http://\S*?["]    same, but the quote is included in the match
# \S = non-whitespace, * = zero or more, ? = non-greedy;
# the match ends at one of the characters ", > or )
# note: inside [...] a "|" is a literal character, not alternation,
# so the class is written [\">)] rather than [\"|>|)]
httpregex = re.compile(r"(http://\S*?)[\">)]", re.IGNORECASE)
# Read line by line
for line in urllib.request.urlopen("http://www.baidu.com"):
    line = line.decode("utf-8")
    mylist = httpregex.findall(line)
    if mylist:  # print only when matches are found
        print(mylist)
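An offline check of the terminator behavior on a made-up HTML fragment (the sample is illustrative):

import re

httpregex = re.compile(r"(http://\S*?)[\">)]", re.IGNORECASE)
sample = '<a href="http://example.com/a">link</a> (see http://example.com/b)'
print(httpregex.findall(sample))
# ['http://example.com/a', 'http://example.com/b']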
5. Scraping relative paths
import urllib.request
import urllib
import re

def getallyurl(data):  # collect absolute links plus resolved relative ones
    alllist = []
    mylist1 = gethttp(data)
    mylist2 = []
    if len(mylist1) > 0:
        mylist2 = getabsyurl(mylist1[0], data)  # use the first link to guess the host
    alllist.extend(mylist1)
    alllist.extend(mylist2)
    return alllist
def getabsyurl(url, data):  # resolve relative hrefs against the host of url
    try:
        regex = re.compile(r"href=\"(.*?)\"", re.IGNORECASE)
        httplist = regex.findall(data)
        # iterate over a copy so items can be removed from the original list
        for item in httplist.copy():
            # drop absolute links and javascript: pseudo-links in one test,
            # so remove() cannot be called twice on the same item
            if item.find("http://") != -1 or item.find("javascript") != -1:
                httplist.remove(item)
        hostname = gethostname(url)
        if hostname is not None:
            for i in range(len(httplist)):
                httplist[i] = hostname + httplist[i]  # prepend host to the relative path
        return httplist
    except:
        return []  # keep the return type consistent on error
def gethostname(httpstr):  # extract the host part, e.g. "http://bbs.tianya.cn"
    try:
        hostregex = re.compile(r"(http://\S*?)/", re.IGNORECASE)
        mylist = hostregex.findall(httpstr)
        if len(mylist) == 0:
            return None
        return mylist[0]
    except:
        return None
def gethttp(data):  # extract every absolute http:// link
    try:
        urlregex = re.compile(r"(http://\S*?)[\">)]", re.IGNORECASE)
        return urlregex.findall(data)
    except:
        return []
def getallemail(data):  # extract every email address
    try:
        mailregex = re.compile(r"([A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4})", re.IGNORECASE)
        return mailregex.findall(data)
    except:
        return []
def getdata(url):  # fetch and decode a page, "" on any error
    try:
        data = urllib.request.urlopen(url).read().decode("utf-8")
        return data
    except:
        return ""

#print(gethttp(getdata("http://bbs.tianya.cn/post-140-393974-1.shtml")))
pagedata = getdata("http://bbs.tianya.cn/post-140-393974-1.shtml")
print(getallyurl(pagedata))
'''
mylist=gethttp(pagedata)
hostname=gethostname(mylist[0])
print(hostname)
print(getabsyurl(mylist[0],pagedata))
'''
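The standard library can do the host extraction and relative-path resolution that gethostname and getabsyurl hand-roll here; a brief sketch using urllib.parse (the relative path below is made up for illustration):

from urllib.parse import urljoin, urlparse

base = "http://bbs.tianya.cn/post-140-393974-1.shtml"
print(urljoin(base, "/post-140-394692-1.shtml"))
# http://bbs.tianya.cn/post-140-394692-1.shtml
print(urlparse(base).netloc)
# bbs.tianya.cn  (what gethostname extracts, minus the "http://" prefix)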
6. Depth-first crawl of web pages printing email addresses (DFS), implemented with a stack
import urllib.request
import urllib
import re
def getallyurl(data):  # collect absolute links plus resolved relative ones
    alllist = []
    mylist1 = gethttp(data)
    mylist2 = []
    if len(mylist1) > 0:
        mylist2 = getabsyurl(mylist1[0], data)  # use the first link to guess the host
    alllist.extend(mylist1)
    alllist.extend(mylist2)
    return alllist
def getabsyurl(url, data):  # resolve relative hrefs against the host of url
    try:
        regex = re.compile(r"href=\"(.*?)\"", re.IGNORECASE)
        httplist = regex.findall(data)
        # iterate over a copy so items can be removed from the original list
        for item in httplist.copy():
            # drop absolute links and javascript: pseudo-links in one test,
            # so remove() cannot be called twice on the same item
            if item.find("http://") != -1 or item.find("javascript") != -1:
                httplist.remove(item)
        hostname = gethostname(url)
        if hostname is not None:
            for i in range(len(httplist)):
                httplist[i] = hostname + httplist[i]  # prepend host to the relative path
        return httplist
    except:
        return []  # keep the return type consistent on error
def gethostname(httpstr):  # extract the host part, e.g. "http://bbs.tianya.cn"
    try:
        hostregex = re.compile(r"(http://\S*?)/", re.IGNORECASE)
        mylist = hostregex.findall(httpstr)
        if len(mylist) == 0:
            return None
        return mylist[0]
    except:
        return None
def gethttp(data):  # extract every absolute http:// link
    try:
        urlregex = re.compile(r"(http://\S*?)[\">)]", re.IGNORECASE)
        return urlregex.findall(data)
    except:
        return []
def getallemail(data):  # extract every email address
    try:
        mailregex = re.compile(r"([A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4})", re.IGNORECASE)
        return mailregex.findall(data)
    except:
        return []
def getdata(url):  # fetch and decode a page, "" on any error
    try:
        data = urllib.request.urlopen(url).read().decode("utf-8")
        return data
    except:
        return ""

def DFS(urlstr):
    visitlist = []  # pages already visited; DFS loops forever without this
    urlstack = []   # a plain list serves as the stack
    urlstack.append(urlstr)
    while len(urlstack) != 0:
        url = urlstack.pop()  # pop the most recently pushed URL
        print(url)            # print the URL being visited
        if url not in visitlist:
            pagedata = getdata(url)            # fetch the page source
            emaillist = getallemail(pagedata)  # extract emails into a list
            if len(emaillist) != 0:
                for email in emaillist:        # print every email found
                    print(email)
            newurllist = getallyurl(pagedata)  # collect every URL on the page
            if len(newurllist) != 0:
                for urlstr in newurllist:
                    if urlstr not in urlstack:  # skip URLs already on the stack
                        urlstack.append(urlstr)
            visitlist.append(url)
DFS("http://bbs.tianya.cn/post-140-393974-5.shtml")
#DFS("http://www.baidu.com/s?ie=utf-8&f=8&rsv_bp=1&srcqid=4974407339122272475&tn=48020221_29_hao_pg&wd=%E5%B2%9B%E5%9B%BD%E5%A4%A7%E7%89%87%20%E7%95%99%E4%B8%8B%E9%82%AE%E7%AE%B1&oq=%25E5%25A4%25A9%25E6%25B6%25AF%25E5%25A4%25A7%25E5%25AD%25A6%25E8%2580%2581%25E5%25B8%2588%25E9%2582%25AE%25E7%25AE%25B1&rsv_pq=e1e17d5400093975&rsv_t=83fc1KipT0e6dU2l8G8651PAihzqMxhN1tT8Ue1JiKtvBGgKILwuquM4g7%2BKNKKKp6AkBxK7opGg&rqlang=cn&rsv_enter=1&rsv_dl=tb&rsv_sug3=40&rsv_sug1=4&rsv_sug7=100&rsv_sug2=0&rsv_btype=t&inputT=11395&rsv_sug4=11395")
7. Breadth-first crawl of web pages printing email addresses (BFS), using a queue
import urllib.request
import urllib
import re
from collections import deque
def getallyurl(data):  # collect absolute links plus resolved relative ones
    alllist = []
    mylist1 = gethttp(data)
    mylist2 = []
    if len(mylist1) > 0:
        mylist2 = getabsyurl(mylist1[0], data)  # use the first link to guess the host
    alllist.extend(mylist1)
    alllist.extend(mylist2)
    return alllist
def getabsyurl(url, data):  # resolve relative hrefs against the host of url
    try:
        regex = re.compile(r"href=\"(.*?)\"", re.IGNORECASE)
        httplist = regex.findall(data)
        # iterate over a copy so items can be removed from the original list
        for item in httplist.copy():
            # drop absolute links and javascript: pseudo-links in one test,
            # so remove() cannot be called twice on the same item
            if item.find("http://") != -1 or item.find("javascript") != -1:
                httplist.remove(item)
        hostname = gethostname(url)
        if hostname is not None:
            for i in range(len(httplist)):
                httplist[i] = hostname + httplist[i]  # prepend host to the relative path
        return httplist
    except:
        return []  # keep the return type consistent on error
def gethostname(httpstr):  # extract the host part, e.g. "http://bbs.tianya.cn"
    try:
        hostregex = re.compile(r"(http://\S*?)/", re.IGNORECASE)
        mylist = hostregex.findall(httpstr)
        if len(mylist) == 0:
            return None
        return mylist[0]
    except:
        return None
def gethttp(data):  # extract every absolute http:// link
    try:
        urlregex = re.compile(r"(http://\S*?)[\">)]", re.IGNORECASE)
        return urlregex.findall(data)
    except:
        return []
def getallemail(data):  # extract every email address
    try:
        mailregex = re.compile(r"([A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4})", re.IGNORECASE)
        return mailregex.findall(data)
    except:
        return []
def getdata(url):  # fetch and decode a page, "" on any error
    try:
        data = urllib.request.urlopen(url).read().decode("utf-8")
        return data
    except:
        return ""

def BFS(urlstr):
    # note: unlike the DFS version, no visited list is kept here,
    # so pages reachable by several routes may be fetched again
    urlque = deque([])  # a deque serves as the queue
    urlque.append(urlstr)
    while len(urlque) != 0:
        url = urlque.popleft()  # dequeue from the front (FIFO)
        print(url)              # print the URL being visited
        pagedata = getdata(url)            # fetch the page source
        emaillist = getallemail(pagedata)  # extract emails into a list
        if len(emaillist) != 0:
            for email in emaillist:        # print every email found
                print(email)
        newurllist = getallyurl(pagedata)  # collect every URL on the page
        if len(newurllist) != 0:
            for urlstr in newurllist:
                if urlstr not in urlque:   # skip URLs already queued
                    urlque.append(urlstr)
#BFS("http://bbs.tianya.cn/post-140-393974-5.shtml")
BFS("http://www.baidu.com/s?ie=utf-8&f=8&rsv_bp=1&srcqid=4974407339122272475&tn=48020221_29_hao_pg&wd=%E5%B2%9B%E5%9B%BD%E5%A4%A7%E7%89%87%20%E7%95%99%E4%B8%8B%E9%82%AE%E7%AE%B1&oq=%25E5%25A4%25A9%25E6%25B6%25AF%25E5%25A4%25A7%25E5%25AD%25A6%25E8%2580%2581%25E5%25B8%2588%25E9%2582%25AE%25E7%25AE%25B1&rsv_pq=e1e17d5400093975&rsv_t=83fc1KipT0e6dU2l8G8651PAihzqMxhN1tT8Ue1JiKtvBGgKILwuquM4g7%2BKNKKKp6AkBxK7opGg&rqlang=cn&rsv_enter=1&rsv_dl=tb&rsv_sug3=40&rsv_sug1=4&rsv_sug7=100&rsv_sug2=0&rsv_btype=t&inputT=11395&rsv_sug4=11395")