Python Self-Study - Class 19 (Part 2) - Web Scraping Basics


1. Reading a web page (two ways: all at once, or line by line)

import urllib.request  # HTTP requests

# Read the whole page source at once
#mystr = urllib.request.urlopen("http://www.baidu.com").read()
#print(mystr.decode("utf-8"))

# Read line by line
for line in urllib.request.urlopen("http://www.baidu.com"):
    print(line.decode("utf-8"))
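
One practical wrinkle, not covered in the lesson above: some sites reject requests that carry urllib's default User-Agent. Below is a minimal sketch of sending a browser-like header with urllib.request.Request; the header value is only illustrative.

import urllib.request

url = "http://www.baidu.com"
req = urllib.request.Request(url, headers={"User-Agent": "Mozilla/5.0"})  # illustrative User-Agent value
with urllib.request.urlopen(req) as resp:
    html = resp.read().decode("utf-8")
print(len(html))  # just confirm the page was fetched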

2. Scraping email addresses

import urllib.request
import re

mailregex = re.compile(r"([A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4})", re.IGNORECASE)

'''
# Read line by line
for line in urllib.request.urlopen("http://bbs.tianya.cn/post-140-393974-1.shtml"):
    mylist = mailregex.findall(line.decode("utf-8"))
    if mylist:        # print only when something was found
        print(mylist)
'''

# Read the whole page source at once
mystr = urllib.request.urlopen("http://bbs.tianya.cn/post-140-393974-1.shtml").read()
mylist = mailregex.findall(mystr.decode("utf-8"))
print(mylist)
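
On a long thread page, findall returns the same address many times. A small sketch (my addition, reusing mailregex and mystr from above) that keeps only the first occurrence of each address:

seen = set()
unique_emails = []
for email in mailregex.findall(mystr.decode("utf-8")):
    if email not in seen:          # keep first-seen order, drop duplicates
        seen.add(email)
        unique_emails.append(email)
print(unique_emails)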


3. Scraping QQ numbers

import urllib.request
import re

QQregex = re.compile(r"[1-9]\d{4,10}", re.IGNORECASE)

# Read line by line
for line in urllib.request.urlopen("http://bbs.tianya.cn/post-140-393974-1.shtml"):
    line = line.decode("utf-8")
    if line.find("QQ") != -1 or line.find("Qq") != -1 or line.find("qq") != -1:
        mylist = QQregex.findall(line)
        if mylist:        # print only when something was found
            print(mylist)
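
The pattern [1-9]\d{4,10} matches any run of 5 to 11 digits, so it also fires inside longer numbers such as order IDs or timestamps. A sketch (my addition) that anchors the pattern with word boundaries to cut some of those false positives:

import re

QQregex_strict = re.compile(r"\b[1-9]\d{4,10}\b")

sample = "QQ 123456789, order 2024111612345678"  # made-up sample text
print(QQregex_strict.findall(sample))  # ['123456789']; the 16-digit number cannot match in full between word boundaries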

4. Scraping links

import urllib.request
import re

# (http://\S*?)[\"]    capture up to the terminating ", without the "
# http://\S*?[\"]      capture up to the terminating ", including the "
# \S = non-whitespace, * = zero or more, ? = non-greedy; the match ends at " or > or )
httpregex = re.compile(r"(http://\S*?)[\"|>|)]", re.IGNORECASE)

# Read line by line
for line in urllib.request.urlopen("http://www.baidu.com"):
    line = line.decode("utf-8")
    mylist = httpregex.findall(line)
    if mylist:        # print only when something was found
        print(mylist)
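
One detail worth knowing: inside a character class the "|" is a literal character, not alternation, so [\"|>|)] actually ends the match at any of ", |, > or ). Below is a sketch (my addition) with a tighter class that also accepts https and deduplicates the results; it reuses the same Baidu page as above.

import re
import urllib.request

httpregex2 = re.compile(r"(https?://\S*?)['\">)]", re.IGNORECASE)  # stop at ' " > or )

html = urllib.request.urlopen("http://www.baidu.com").read().decode("utf-8")
for link in sorted(set(httpregex2.findall(html))):  # deduplicate and sort
    print(link)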

5. Scraping relative paths and resolving them to absolute URLs

import urllib.request
import urllib
import re

def getallyurl(data):
    alllist = []
    mylist1 = []
    mylist2 = []
    mylist1 = gethttp(data)
    if len(mylist1) > 0:
        mylist2 = getabsyurl(mylist1[0], data)
    alllist.extend(mylist1)
    alllist.extend(mylist2)
    return alllist

def getabsyurl(url, data):
    try:
        regex = re.compile("href=\"(.*?)\"", re.IGNORECASE)
        httplist = regex.findall(data)
        newhttplist = httplist.copy()  # copy the list so items can be removed while iterating
        for data in newhttplist:
            if data.find("http://") != -1:
                httplist.remove(data)
            if data.find("javascript") != -1:
                httplist.remove(data)
        hostname = gethostname(url)
        if hostname != None:
            for i in range(len(httplist)):
                httplist[i] = hostname + httplist[i]
        return httplist
    except:
        return ""

def gethostname(httpstr):   # extract the host part of a URL
    try:
        mailregex = re.compile(r"(http://\S*?)/", re.IGNORECASE)
        mylist = mailregex.findall(httpstr)
        if len(mylist) == 0:
            return None
        else:
            return mylist[0]
    except:
        return None

def gethttp(data):
    try:
        mailregex = re.compile(r"(http://\S*?)[\"|>|)]", re.IGNORECASE)
        mylist = mailregex.findall(data)
        return mylist
    except:
        return ""

def getallemail(data):
    try:
        mailregex = re.compile(r"([A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4})", re.IGNORECASE)
        mylist = mailregex.findall(data)
        return mylist
    except:
        return ""

def getdata(url):
    try:
        data = urllib.request.urlopen(url).read().decode("utf-8")
        return data
    except:
        return ""  # return an empty string on error

#print(gethttp(getdata("http://bbs.tianya.cn/post-140-393974-1.shtml")))
pagedata = getdata("http://bbs.tianya.cn/post-140-393974-1.shtml")
print(getallyurl(pagedata))
'''
mylist=gethttp(pagedata)
hostname=gethostname(mylist[0])
print(hostname)
print(getabsyurl(mylist[0],pagedata))
'''
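
The hand-rolled getabsyurl above prepends the host name to every href, which works for paths like "/nav/..." but not for hrefs relative to the current directory, "../" paths, or https pages. The standard library already covers these cases with urllib.parse.urljoin; the following is a sketch of an alternative (getabsyurl_urljoin is my own name, not from the lesson):

import re
import urllib.parse
import urllib.request

def getabsyurl_urljoin(base_url, data):
    # Resolve every href against base_url; urljoin handles "/x", "x" and "../x"
    # and leaves absolute links unchanged.
    hrefs = re.findall(r'href="(.*?)"', data, re.IGNORECASE)
    result = []
    for href in hrefs:
        if href.startswith("javascript"):
            continue  # skip javascript: pseudo-links
        result.append(urllib.parse.urljoin(base_url, href))
    return result

url = "http://bbs.tianya.cn/post-140-393974-1.shtml"
page = urllib.request.urlopen(url).read().decode("utf-8")
print(getabsyurl_urljoin(url, page)[:10])  # first ten resolved links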

6. Depth-first traversal of pages to print email addresses (DFS), implemented with a stack

import urllib.request
import urllib
import re
def getallyurl(data):
    alllist = []
    mylist1 = []
    mylist2 = []
    mylist1 = gethttp(data)
    if len(mylist1) > 0:
        mylist2 = getabsyurl(mylist1[0], data)
    alllist.extend(mylist1)
    alllist.extend(mylist2)
    return alllist

def getabsyurl(url, data):
    try:
        regex = re.compile("href=\"(.*?)\"", re.IGNORECASE)
        httplist = regex.findall(data)
        newhttplist = httplist.copy()  # copy the list so items can be removed while iterating
        for data in newhttplist:
            if data.find("http://") != -1:
                httplist.remove(data)
            if data.find("javascript") != -1:
                httplist.remove(data)
        hostname = gethostname(url)
        if hostname != None:
            for i in range(len(httplist)):
                httplist[i] = hostname + httplist[i]
        return httplist
    except:
        return ""

def gethostname(httpstr):   # extract the host part of a URL
    try:
        mailregex = re.compile(r"(http://\S*?)/", re.IGNORECASE)
        mylist = mailregex.findall(httpstr)
        if len(mylist) == 0:
            return None
        else:
            return mylist[0]
    except:
        return None

def gethttp(data):
    try:
        mailregex = re.compile(r"(http://\S*?)[\"|>|)]", re.IGNORECASE)
        mylist = mailregex.findall(data)
        return mylist
    except:
        return ""

def getallemail(data):
    try:
        mailregex = re.compile(r"([A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4})", re.IGNORECASE)
        mylist = mailregex.findall(data)
        return mylist
    except:
        return ""

def getdata(url):
    try:
        data = urllib.request.urlopen(url).read().decode("utf-8")
        return data
    except:
        return ""  # return an empty string on error

def DFS(urlstr):
    visitlist = []  # URLs already visited; without this, depth-first crawling easily loops forever
    urlstack = []   # create the stack
    urlstack.append(urlstr)
    while len(urlstack) != 0:
        url = urlstack.pop()  # pop a URL off the stack
        print(url)            # print the URL
        if url not in visitlist:
            pagedata = getdata(url)              # fetch the page source
            emaillist = getallemail(pagedata)    # extract the email addresses
            if len(emaillist) != 0:              # if any were found
                for email in emaillist:          # print every address
                    print(email)
            newurllist = getallyurl(pagedata)    # collect all URLs on the page
            if len(newurllist) != 0:             # if any were found
                for urlstr in newurllist:        # process every URL
                    if urlstr not in urlstack:   # skip URLs already on the stack
                        urlstack.append(urlstr)  # push it
            visitlist.append(url)
DFS("http://bbs.tianya.cn/post-140-393974-5.shtml")
#DFS("http://www.baidu.com/s?ie=utf-8&f=8&rsv_bp=1&srcqid=4974407339122272475&tn=48020221_29_hao_pg&wd=%E5%B2%9B%E5%9B%BD%E5%A4%A7%E7%89%87%20%E7%95%99%E4%B8%8B%E9%82%AE%E7%AE%B1&oq=%25E5%25A4%25A9%25E6%25B6%25AF%25E5%25A4%25A7%25E5%25AD%25A6%25E8%2580%2581%25E5%25B8%2588%25E9%2582%25AE%25E7%25AE%25B1&rsv_pq=e1e17d5400093975&rsv_t=83fc1KipT0e6dU2l8G8651PAihzqMxhN1tT8Ue1JiKtvBGgKILwuquM4g7%2BKNKKKp6AkBxK7opGg&rqlang=cn&rsv_enter=1&rsv_dl=tb&rsv_sug3=40&rsv_sug1=4&rsv_sug7=100&rsv_sug2=0&rsv_btype=t&inputT=11395&rsv_sug4=11395")

7. Breadth-first traversal of pages to print email addresses (BFS), implemented with a queue

import urllib.request
import urllib
import re
from collections import deque
def getallyurl(data):
    alllist = []
    mylist1 = []
    mylist2 = []
    mylist1 = gethttp(data)
    if len(mylist1) > 0:
        mylist2 = getabsyurl(mylist1[0], data)
    alllist.extend(mylist1)
    alllist.extend(mylist2)
    return alllist

def getabsyurl(url, data):
    try:
        regex = re.compile("href=\"(.*?)\"", re.IGNORECASE)
        httplist = regex.findall(data)
        newhttplist = httplist.copy()  # copy the list so items can be removed while iterating
        for data in newhttplist:
            if data.find("http://") != -1:
                httplist.remove(data)
            if data.find("javascript") != -1:
                httplist.remove(data)
        hostname = gethostname(url)
        if hostname != None:
            for i in range(len(httplist)):
                httplist[i] = hostname + httplist[i]
        return httplist
    except:
        return ""

def gethostname(httpstr):   # extract the host part of a URL
    try:
        mailregex = re.compile(r"(http://\S*?)/", re.IGNORECASE)
        mylist = mailregex.findall(httpstr)
        if len(mylist) == 0:
            return None
        else:
            return mylist[0]
    except:
        return None

def gethttp(data):
    try:
        mailregex = re.compile(r"(http://\S*?)[\"|>|)]", re.IGNORECASE)
        mylist = mailregex.findall(data)
        return mylist
    except:
        return ""

def getallemail(data):
    try:
        mailregex = re.compile(r"([A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4})", re.IGNORECASE)
        mylist = mailregex.findall(data)
        return mylist
    except:
        return ""

def getdata(url):
    try:
        data = urllib.request.urlopen(url).read().decode("utf-8")
        return data
    except:
        return ""  # return an empty string on error

def BFS(urlstr):
    urlque = deque([])  # create the queue
    urlque.append(urlstr)
    while len(urlque) != 0:
        url = urlque.popleft()  # take a URL off the front of the queue
        print(url)              # print the URL
        pagedata = getdata(url)             # fetch the page source
        emaillist = getallemail(pagedata)   # extract the email addresses
        if len(emaillist) != 0:             # if any were found
            for email in emaillist:         # print every address
                print(email)
        newurllist = getallyurl(pagedata)   # collect all URLs on the page
        if len(newurllist) != 0:            # if any were found
            for urlstr in newurllist:       # process every URL
                if urlstr not in urlque:    # only checks the queue, so already-visited pages can be re-added
                    urlque.append(urlstr)   # enqueue it
#BFS("http://bbs.tianya.cn/post-140-393974-5.shtml")
BFS("http://www.baidu.com/s?ie=utf-8&f=8&rsv_bp=1&srcqid=4974407339122272475&tn=48020221_29_hao_pg&wd=%E5%B2%9B%E5%9B%BD%E5%A4%A7%E7%89%87%20%E7%95%99%E4%B8%8B%E9%82%AE%E7%AE%B1&oq=%25E5%25A4%25A9%25E6%25B6%25AF%25E5%25A4%25A7%25E5%25AD%25A6%25E8%2580%2581%25E5%25B8%2588%25E9%2582%25AE%25E7%25AE%25B1&rsv_pq=e1e17d5400093975&rsv_t=83fc1KipT0e6dU2l8G8651PAihzqMxhN1tT8Ue1JiKtvBGgKILwuquM4g7%2BKNKKKp6AkBxK7opGg&rqlang=cn&rsv_enter=1&rsv_dl=tb&rsv_sug3=40&rsv_sug1=4&rsv_sug7=100&rsv_sug2=0&rsv_btype=t&inputT=11395&rsv_sug4=11395")

