Python Web Scraping Example: Excerpting Classical Chinese Poems


1. Analyzing the Requirements

Target URL:

https://www.sou-yun.cn/Query.aspx?type=poem&id=×××××

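Before writing any extraction code, it helps to fetch one page and skim the raw HTML to locate the markup around the title, author, and verses. A minimal inspection sketch, assuming the example id 36647 that the rest of this post uses:

```python
import requests

# Fetch a single poem page and dump the start of the raw HTML so the
# markup around the title, author, and verse containers can be studied.
url = 'https://www.sou-yun.cn/Query.aspx?type=poem1&id=36647'
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.139 Safari/537.36'}
response = requests.get(url, headers=headers)

print(response.status_code)   # expect 200 when the id exists
print(response.text[:2000])   # first chunk of HTML, enough to spot the class names
```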

2. Extracting the Verses

python">import os
import re
import requests
import parsel#url ='https://www.sou-yun.cn/PoemIndex.aspx?dynasty=Tang&author=14976&type=Jie'
url='https://www.sou-yun.cn/Query.aspx?type=poem1&id=36647'
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.139 Safari/537.36'}
response = requests.get(url=url,headers=headers)
html_content= response.text
#print(response.text)
# 正则表达式匹配
poem_sentences = re.findall(r"<div class='poemSentence'[^>]*>(.*?)<\/div>", html_content, re.DOTALL)# 清理并输出提取的诗句
for sentence in poem_sentences:# 移除HTML标签clean_sentence = re.sub(r"<.*?>", "", sentence).strip()if clean_sentence:  # 过滤掉空句print(clean_sentence)
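Regexes over raw HTML are brittle. The parsel library (imported but unused in the original script) offers a CSS-selector alternative; a minimal sketch, assuming only the `div.poemSentence` class name matched above:

```python
import parsel
import requests

url = 'https://www.sou-yun.cn/Query.aspx?type=poem1&id=36647'
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.139 Safari/537.36'}
html_content = requests.get(url, headers=headers).text

# Select each verse container by class and join its text nodes
selector = parsel.Selector(text=html_content)
for div in selector.css('div.poemSentence'):
    text = ''.join(div.css('::text').getall()).strip()
    if text:
        print(text)
```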

3. Additional Information

Extract all the required information: title + author + verses.

python">
import os
import re
import requests
import parsel#url ='https://www.sou-yun.cn/PoemIndex.aspx?dynasty=Tang&author=14976&type=Jie'
url='https://www.sou-yun.cn/Query.aspx?type=poem1&id=36647'
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.139 Safari/537.36'}
response = requests.get(url=url,headers=headers)
html_content= response.text
#print(response.text)
# 提取标题
title_match = re.search(r"<span class='bold'><span class='wordLink'[^>]*>(.*?)<\/span><\/span>\s*<span[^>]*>(.*?)<\/span>\s*<span class='poemAuthor'[^>]*>(.*?)<\/span>", html_content)
if title_match:title = title_match.group(1) + title_match.group(2)  # 合并标题部分author = re.sub(r"<.*?>", "", title_match.group(3)).strip()  # 处理作者# 正则表达式匹配诗句
poem_sentences = re.findall(r"<div class='poemSentence'[^>]*>(.*?)<\/div>", html_content, re.DOTALL)# 清理并输出提取的信息
print("标题:", title)
print("作者:", author)
print("诗句:")for sentence in poem_sentences:# 移除HTML标签clean_sentence = re.sub(r"<.*?>", "", sentence).strip()if clean_sentence:  # 过滤掉空句print(clean_sentence)

Tweak the output format:

python">import os
import re
import requests
import parsel#url ='https://www.sou-yun.cn/PoemIndex.aspx?dynasty=Tang&author=14976&type=Jie'
url='https://www.sou-yun.cn/Query.aspx?type=poem1&id=36647'
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.139 Safari/537.36'}
response = requests.get(url=url,headers=headers)
html_content= response.text
#print(response.text)
# 提取标题
title_match = re.search(r"<span class='bold'><span class='wordLink'[^>]*>(.*?)<\/span><\/span>\s*<span[^>]*>(.*?)<\/span>\s*<span class='poemAuthor'[^>]*>(.*?)<\/span>", html_content)
if title_match:title = title_match.group(1) + title_match.group(2)  # 合并标题部分author = re.sub(r"<.*?>", "", title_match.group(3)).strip()  # 处理作者# 正则表达式匹配诗句
poem_sentences = re.findall(r"<div class='poemSentence'[^>]*>(.*?)<\/div>", html_content, re.DOTALL)# 清理并输出提取的信息
print("《 " + title + "》 ("+ author + ")")
#print("作者:", author)
#print("诗句:")for sentence in poem_sentences:# 移除HTML标签clean_sentence = re.sub(r"<.*?>", "", sentence).strip()if clean_sentence:  # 过滤掉空句print(clean_sentence)

4. Saving to a File

Save a single poem to a txt file.

python">import os
import re
import requests
import parsel#url ='https://www.sou-yun.cn/PoemIndex.aspx?dynasty=Tang&author=14976&type=Jie'
url='https://www.sou-yun.cn/Query.aspx?type=poem1&id=36647'
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.139 Safari/537.36'}
response = requests.get(url=url,headers=headers)
html_content= response.text
#print(response.text)
# 提取标题
title_match = re.search(r"<span class='bold'><span class='wordLink'[^>]*>(.*?)<\/span><\/span>\s*<span[^>]*>(.*?)<\/span>\s*<span class='poemAuthor'[^>]*>(.*?)<\/span>", html_content)
if title_match:title = title_match.group(1) + title_match.group(2)  # 合并标题部分author = re.sub(r"<.*?>", "", title_match.group(3)).strip()  # 处理作者# 正则表达式匹配诗句
poem_sentences = re.findall(r"<div class='poemSentence'[^>]*>(.*?)<\/div>", html_content, re.DOTALL)# 清理并准备写入文件的内容
output = f"《 " + title + "》 ("+ author + ")\n"
print("《 " + title + "》 ("+ author + ")")for sentence in poem_sentences:# 移除HTML标签clean_sentence = re.sub(r"<.*?>", "", sentence).strip()if clean_sentence:  # 过滤掉空句output += clean_sentence + "\n"print(clean_sentence)# 将结果写入文本文件with open('poem.txt', 'w', encoding='utf-8') as file:file.write(output)print("信息已保存到 poem.txt")

5. Continuing with Multiple Poems

Not every id in the range will yield a usable result; some ids cannot be retrieved for now (the page content is structured differently).


Those pages come back as "not found" because the regex was written against the structure of the first page.

python">import os
import re
import requests
import parsel#url ='https://www.sou-yun.cn/PoemIndex.aspx?dynasty=Tang&author=14976&type=Jie'
#url='https://www.sou-yun.cn/Query.aspx?type=poem1&id=36647'
#headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.139 Safari/537.36'}
#response = requests.get(url=url,headers=headers)
#html_content= response.text
#print(response.text)# 指定保存文件的路径
output_file_path = 'all_poems.txt'# 先清空(如果存在)或创建目标文件
with open(output_file_path, 'w', encoding='utf-8') as file:file.write("")  # 清空文件内容# 循环下载每首诗
for poem_id in range(36647, 36848):url = f'https://www.sou-yun.cn/Query.aspx?type=poem1&id={poem_id}'headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.139 Safari/537.36'}response = requests.get(url=url, headers=headers)#html_content = response.text# 获取网页内容#response = requests.get(url)if response.status_code == 200:html_content = response.text# 提取标题title_match = re.search(r"<span class='bold'><span class='wordLink'[^>]*>(.*?)<\/span><\/span>\s*<span[^>]*>(.*?)<\/span>\s*<span class='poemAuthor'[^>]*>(.*?)<\/span>",html_content)if title_match:title = title_match.group(1) + title_match.group(2)  # 合并标题部分author = re.sub(r"<.*?>", "", title_match.group(3)).strip()  # 处理作者# 正则表达式匹配诗句poem_sentences = re.findall(r"<div class='poemSentence'[^>]*>(.*?)<\/div>", html_content, re.DOTALL)# 清理并准备写入文件的内容output = f"《 " + title + "》 ("+ author + ")\n"for sentence in poem_sentences:# 移除HTML标签clean_sentence = re.sub(r"<.*?>", "", sentence).strip()if clean_sentence:  # 过滤掉空句output += clean_sentence + "\n"# 为每首诗添加分隔线output += "\n" + "=" * 50 + "\n\n"  # 分隔线,用于区分不同的诗# 将结果追加到文本文件with open(output_file_path, 'a', encoding='utf-8') as file:  # 以追加模式打开文件file.write(output)print(f"信息已保存到 {output_file_path}")else:print(f"在ID {poem_id} 的页面中找不到诗的标题或作者。")else:print(f"无法获取ID {poem_id} 的页面,状态码: {response.status_code}")
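Fetching two hundred pages in a tight loop risks getting throttled or temporarily blocked. A hedged sketch of a politer fetch helper with a delay and a simple retry; the one-second delay, retry count, and timeout are arbitrary assumptions, not values from the original post:

```python
import time
from typing import Optional

import requests

HEADERS = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.139 Safari/537.36'}

def fetch(url: str, retries: int = 3, delay: float = 1.0) -> Optional[str]:
    """Fetch a URL politely: pause between attempts and retry on failure."""
    for attempt in range(retries):
        try:
            response = requests.get(url, headers=HEADERS, timeout=10)
            if response.status_code == 200:
                return response.text
        except requests.RequestException as exc:
            print(f'attempt {attempt + 1} failed: {exc}')
        time.sleep(delay)  # wait before retrying (and between pages)
    return None

# Hypothetical use inside the id loop above:
# html_content = fetch(f'https://www.sou-yun.cn/Query.aspx?type=poem1&id={poem_id}')
```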



