微博一级评论爬虫

server/2024/9/24 11:22:35/

cookies需要替换成自己的

import requests
import requests
from lxml import etree
import openpyxl
from concurrent.futures.thread import ThreadPoolExecutor
import re
from datetime import datetime, timedelta
from urllib import parse
from jsonpath import jsonpath
from datetime import datetime
import os
import csv
import time
import random
import logging
try:
    import colorlog
except ImportError:  # coloured output is purely cosmetic; fall back to plain logging
    colorlog = None

# Column headers shared by the CSV and XLSX writers (13 columns, same order as
# the rows built in get_comments).
NAME_HEADERS = ['标题', '发布者', 'id', 'rootid', '内容', '评论者名称', ' 性别', '时间',
                '粉丝数', '评论数', '点赞数', 'IP', '居住地']

# Seconds to wait for any single HTTP request before giving up; without a
# timeout a stalled connection would hang the crawler forever.
REQUEST_TIMEOUT = 10


def log_init():
    """Configure the root logger with a single console handler and return it.

    Any handlers already attached to the root logger are removed first, so
    calling this more than once never produces duplicated log lines.  When
    ``colorlog`` is installed the output is colourised per level; otherwise a
    plain formatter with the same layout is used.
    """
    logger = logging.getLogger()
    logger.setLevel(logging.DEBUG)

    # Start from a clean slate so exactly one handler remains afterwards.
    for handler in logger.handlers[:]:
        logger.removeHandler(handler)

    sh = logging.StreamHandler()
    if colorlog is not None:
        fmt = '%(log_color)s%(asctime)s %(levelname)s [%(name)s] [%(filename)s:%(lineno)d] - %(message)s%(reset)s'
        formatter = colorlog.ColoredFormatter(
            fmt,
            log_colors={
                'DEBUG': 'red',
                'INFO': 'yellow',
                'WARNING': 'green',
                'ERROR': 'cyan',
                'CRITICAL': 'red,bg_white',
            },
            style='%',
        )
    else:
        formatter = logging.Formatter(
            '%(asctime)s %(levelname)s [%(name)s] [%(filename)s:%(lineno)d] - %(message)s'
        )
    sh.setFormatter(formatter)
    logger.addHandler(sh)
    return logger


def get_cookies():
    """Return one cookie dict picked at random from ``cookies_list``.

    ``cookies_list`` is a placeholder that must be filled with your own
    logged-in Weibo cookies (one dict per account).

    Raises:
        RuntimeError: if ``cookies_list`` is still empty.
    """
    cookies_list = []
    if not cookies_list:
        raise RuntimeError(
            'cookies_list is empty: fill it with your own Weibo cookies in get_cookies()'
        )
    return random.choice(cookies_list)


def crawl(response):
    """Parse one search-result page and fetch the comments of every post on it.

    Args:
        response: HTML text of a ``s.weibo.com/weibo`` result page.

    For each feed item the author, date, forward/comment/like counters and
    text are extracted and printed, and ``get_comments`` is called to store
    the post's first-level comments.  A failure on one item is logged and the
    loop continues with the next item.
    """
    html = etree.HTML(response)
    nodes = html.xpath('//div[@action-type="feed_list_item"]')
    num = 0  # index into mid_list / uid_list for the current post
    mid_list = html.xpath("//div[@class='card-wrap']/@mid")
    hrefs = ",".join(html.xpath("//div[@class='avator']/a[@target='_blank']/@href"))
    uid_list = re.findall(r'//weibo\.com/(\d+)', hrefs)

    for node in nodes:
        try:
            name = node.xpath('.//a[@class="name"]/text()')[0]
            # Prefer the expanded text; fall back to the truncated one.
            content = (node.xpath('.//p[@node-type="feed_list_content_full"]//text()')
                       or node.xpath('.//p[@node-type="feed_list_content"]//text()'))
            text = ''.join(content).strip().replace('\u200b', '')
            date_str = node.xpath('.//div[@class="from"]/a[1]/text()')[0].strip()
            forwards = node.xpath('.//div[@class="card-act"]/ul/li[1]/a//text()')
            comments_counts = node.xpath('.//div[@class="card-act"]/ul/li[2]/a//text()')
            likes = node.xpath('.//div[@class="card-act"]/ul/li[3]/a//text()')

            # When the counter is zero the button shows only its label.
            if forwards[-1].strip() in ' 转发':
                forwards[-1] = '0'
            if comments_counts[-1].strip() in ' 评论':
                comments_counts[-1] = '0'

            # NOTE(review): after the normalisation above this is always true;
            # possibly `!= '0'` was intended - confirm before changing.
            if comments_counts[-1].strip():
                mid = mid_list[num]
                uid = uid_list[num]
                # Advance before fetching so that a failed fetch does not make
                # the next post reuse this post's mid/uid.
                num += 1
                # First page: max_id must stay None (the keyword used to be
                # passed here by mistake and was sent to the API as max_id).
                get_comments(mid, uid, text, name)

            if likes[2].strip() in '赞':
                likes[2] = '0'
            result = [name, date_str, forwards[-1].strip(), comments_counts[-1].strip(),
                      likes[2].strip(), text]
            print(result)
        except Exception as exc:  # best effort: one broken card must not stop the page
            logging.warning('skipping feed item: %r', exc)


def get_comments(mid, uid, content, name_au, max_id=None):
    """Download every first-level comment of one post and append it to the CSV.

    Args:
        mid: post id, sent as the ``id`` query parameter.
        uid: author id, sent as the ``uid`` query parameter.
        content: text of the post, stored in the first column of every row.
        name_au: screen name of the post's author.
        max_id: pagination cursor; ``None`` requests the first page.

    Pages are followed iteratively through the ``max_id`` cursor returned by
    the API until a page has no data or the cursor is 0/absent.  The global
    counter ``num`` is increased once per stored comment.
    """
    global num
    url = "https://weibo.com/ajax/statuses/buildComments"

    while True:
        if max_id is None:
            params = {
                'is_reload': '1',
                'id': mid,
                'is_show_bulletin': '2',
                'is_mix': '0',
                'count': '10',
                'uid': uid,
                'fetch_level': '0',
                'locale': 'zh-CN',
            }
        else:
            params = {
                'flow': '0',
                'is_reload': '1',
                'id': mid,
                'is_show_bulletin': '2',
                'is_mix': '0',
                'max_id': max_id,
                'count': '20',
                'uid': uid,
                'fetch_level': '0',
                'locale': 'zh-CN',
            }
        response = requests.get(url, headers=headers, cookies=get_cookies(),
                                params=params, timeout=REQUEST_TIMEOUT).json()

        comments = response.get('data')
        if not comments:
            return

        # Read each field from the comment object itself so that all columns
        # of a row are guaranteed to belong to the same comment.
        for item in comments:
            user = item['user']
            gender = "男" if user['gender'] == "m" else '女'
            try:
                dt = datetime.strptime(item['created_at'], "%a %b %d %H:%M:%S %z %Y")
                time_str = dt.strftime("%Y-%m-%d %H:%M:%S")
            except (KeyError, TypeError, ValueError):
                time_str = ''

            # Same order as NAME_HEADERS.  `source` fills the 'IP' column
            # (presumably the IP-location string - TODO confirm against the API).
            data_list = [content, name_au, str(item['id']), str(item['rootid']),
                         item['text_raw'], user['screen_name'], gender, time_str,
                         user['followers_count'], item['total_number'],
                         item['like_counts'], item.get('source', ''), user['location']]
            save_data_to_csv(data_list)
            num += 1
            logging.info(f"{YELLOW}评论数 :{num} " + f"一级评论{data_list}")

        max_id = response.get('max_id', 0)
        if not max_id:  # 0 or missing cursor: no further page
            return


def save_data_to_xlsx(data):
    """Append one row to ``李佳琪/<key_word>.xlsx``, creating the file if needed.

    Args:
        data: list of cell values in ``NAME_HEADERS`` order.

    A new workbook gets the header row first.  The target directory is
    created when it does not exist yet.
    """
    filename = f'李佳琪/{key_word}.xlsx'
    os.makedirs(os.path.dirname(filename), exist_ok=True)
    if os.path.exists(filename):
        workbook = openpyxl.load_workbook(filename)
        sheet = workbook.active
    else:
        workbook = openpyxl.Workbook()
        sheet = workbook.active
        sheet.append(NAME_HEADERS)
    sheet.append(data)
    workbook.save(filename)


def save_data_to_csv(data_list):
    """Append one row to ``<key_word>.csv``, writing the header for a new file.

    Args:
        data_list: list of cell values in ``NAME_HEADERS`` order.

    The header and the first data row are both written when the file is
    created, so no row is lost.
    """
    filename = f'{key_word}.csv'
    is_new = not os.path.isfile(filename)
    # A new file is written with a BOM (utf-8-sig); later appends use plain
    # utf-8, exactly as before.
    encoding = 'utf-8-sig' if is_new else 'utf-8'
    with open(filename, 'a', encoding=encoding, newline='') as f:
        writer = csv.writer(f)
        if is_new:
            writer.writerow(NAME_HEADERS)
        writer.writerow(data_list)


def run():
    """Walk through search-result pages 1..``page`` for ``key_word`` and crawl each.

    Stops early when the page reports that nothing was found.
    """
    url = "https://s.weibo.com/weibo"
    for i in range(1, page + 1):  # `page` is a page count, so include the last page
        params = {
            "q": f'{key_word}',
            "page": f"{i}",
            "xsort": "hot",
            "suball": "1",
            "timescope": f"custom:'{start_time}':'{end_time}'",
            "Refer": "g",
        }
        resp = requests.get(url, headers=headers, cookies=get_cookies(),
                            params=params, timeout=REQUEST_TIMEOUT)
        response = resp.text
        xml = etree.HTML(response)
        err_msg = ",".join(xml.xpath("//div[@class='card card-no-result s-pt20b40']/p/text()"))
        if '抱歉,未找到相关结果。' in err_msg:
            break
        # Reuse the response that was already fetched instead of requesting
        # the page a second time just to show its URL.
        print(resp.url)
        crawl(response)


if __name__ == '__main__':
    RED = '\033[31m'  # red
    WHITE = '\033[37m'  # white
    YELLOW = '\033[33m'  # yellow
    num = 0  # running total of stored comments
    headers = {
        "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
        "accept-language": "zh-CN,zh;q=0.9",
        "cache-control": "no-cache",
        "pragma": "no-cache",
        "priority": "u=0, i",
        "referer": "https://weibo.com/",
        "sec-ch-ua": "\"Chromium\";v=\"124\", \"Google Chrome\";v=\"124\", \"Not-A.Brand\";v=\"99\"",
        "sec-ch-ua-mobile": "?0",
        "sec-ch-ua-platform": "\"Windows\"",
        "sec-fetch-dest": "document",
        "sec-fetch-mode": "navigate",
        "sec-fetch-site": "same-site",
        "sec-fetch-user": "?1",
        "upgrade-insecure-requests": "1",
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
    }
    key_word = '#邯郸初中生被害案3人被刑事追诉#'
    start_time = '2024-4-1-1'
    end_time = '2024-4-20-1'
    page = 10  # number of result pages to crawl
    log_init()
    run()

http://www.ppmy.cn/server/32285.html

相关文章

Django框架之模板层

一、模版语法 1、模版初识 (1)语法 {{ }}: 变量相关 {% %}: 逻辑相关 (2)变量 ① 传值 在Django的模板语言中按此语法使用: {{ 变量名 }}。 当模版引擎遇到一个变量,它将计算这个变量,然…

对命令模式的理解

目录 一、场景1、文本编辑器并不是一个好的例子,设备控制器才是2、设备控制器的demo 二、不用命令模式1、代码2、问题 三、使用命令模式1、代码2、当需求变化时2.1 新增代码2.2 优点 四、进一步思考1、省略对Command的建模可以吗?2、命令模式的价值 一、…

2024五一数学建模B题思路代码与论文分析

2024五一数学建模B题完整代码和成品论文获取↓↓↓↓↓ https://www.yuque.com/u42168770/qv6z0d/gyoz9ou5upvkv6nx?singleDoc# B题 未来新城交通需求规划与可达率问题需要建立的模型和算法: 1. 图论 2. 网络流模型 3. 线性规划/整数规划 4. 组合优化 5. 随机过程 6. …

C#调用skiasharp操作并绘制图片

之前学习ViewFaceCore时采用Panel控件和GDI将图片及识别出的人脸方框和关键点绘制出来,本文将其修改为基于SKControl和SKCanvas实现相同的显示效果并支持保存为本地图片。   新建Winform项目,在Nuget包管理器中搜索并安装一下SkiaSharp和ViewFaceCore…

STM32 F103C8T6学习笔记17:类IIC通信(SMBus协议)—MLX90614红外非接触温度计

今日学习配置MLX90614红外非接触温度计 与 STM32 F103C8T6 单片机的通信 文章提供测试代码讲解、完整工程下载、测试效果图 本文需要用到的大概基础知识:1.3寸OLED配置通信显示、IIC通信、 定时器配置使用 这里就只贴出我的 OLED驱动方面的网址链接了：…

code-server容器webpack的ws无法连接解决方法

TLDR 通过指定client的wsurl去连接ws devServer.client.webSocketURL = ‘wss://<Forwarded uri>/ws’ 拓扑 1、code-server: 用于编写代码、启动webpack dev-server 服务；[https://<domain>:8001] 2、webpack: 用于浏览dev-server服务；[ht…

2011NOIP普及组真题 2. 统计单词数

线上OJ： 一本通：http://ybt.ssoier.cn:8088/problem_show.php?pid=1954 核心思想 1、本题中比较单词不考虑大小写，所以在比较前先统一转换为小写或者大写。然后再比较即可。 2、由于 s2 会有前导空格，且可能单词之间的空格不止1个…

用 Go struct 不能犯的一个低级错误!

疑惑的例子 其给出的例子一如下： type People struct {}func main() {a := &People{}b := &People{}fmt.Println(a == b) }你认为输出结果是什么呢？ 输出结果是：false。 再稍加改造一下，例子二如下： type Peo…