Case 2: Text Analysis of Eastmoney Guba Forum Posts (Function-Encapsulated Version)
- 1. Load third-party packages and global settings
- 2. Scrape post text from the Eastmoney Guba forum
- 2.1 Scrape and save the data
- 2.2 Load the scraped data and preprocess it
- 3. Analysis of Guba posts
- 3.1 Hot topic analysis
- 3.2 Investor sentiment analysis
- 3.3 Posting time analysis
- 3.4 Hot topic association analysis
- 3.5 User behavior analysis
- 3.6 Call and run the functions from 3.1 to 3.5
1. Load third-party packages and global settings
# Load third-party packages and global settings
import warnings
warnings.filterwarnings('ignore')
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import matplotlib.pyplot as plt
import seaborn as sns
import jieba
from wordcloud import WordCloud
from snownlp import SnowNLP, sentiment  # sentiment-analysis library
import networkx as nx
%matplotlib inline
# Chinese font settings for matplotlib
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
2. Scrape post text from the Eastmoney Guba forum
2.1 Scrape and save the data
headers = {"User-Agent": "add your User-Agent", "Cookie": "add your Cookie"}

# Scrape one page of Guba posts
def get_guba_news_onePage(url):
    response = requests.get(url=url, headers=headers)
    html_content = response.text
    soup = BeautifulSoup(html_content, "html.parser")
    guba_news = soup.find('div', {'id': 'articlelistnew'})
    # Read counts; '1.5万' means 1.5 * 10,000
    read_num = []
    for read in guba_news.find_all('span', {'class': 'l1 a1'})[1:]:
        if '万' in read.text:
            read_num.append(int(float(read.text.replace('万', '')) * 10000))
        else:
            read_num.append(int(read.text))
    # Comment counts, same convention
    comment_num = []
    for comment in guba_news.find_all('span', {'class': 'l2 a2'})[1:]:
        if '万' in comment.text:
            comment_num.append(int(float(comment.text.replace('万', '')) * 10000))
        else:
            comment_num.append(int(comment.text))
    # Titles, author names, and update times (the page omits the year, so prepend it)
    title_content = list(map(lambda x: x.text, guba_news.find_all('span', {'class': 'l3 a3'})[1:]))
    author_name = list(map(lambda x: x.text, guba_news.find_all('span', {'class': 'l4 a4'})[1:]))
    update_time = list(map(lambda x: '2023-' + x.text, guba_news.find_all('span', {'class': 'l5 a5'})[1:]))
    guba_news = pd.DataFrame({'read_num': read_num,
                              'comment_num': comment_num,
                              'title_content': title_content,
                              'author_name': author_name,
                              'update_time': update_time})
    return guba_news

# Scrape pages start_page..end_page of one stock's board and append them to a single CSV
def get_guba_news_batch(stockID, start_page, end_page):
    save_file = './data/guba_news_' + stockID + f'_pages{start_page}-{end_page}' + '.csv'
    for page_num in range(start_page, end_page + 1):
        time.sleep(1)  # throttle to one request per second
        url = "https://guba.eastmoney.com/list,{}_{}.html".format(stockID, page_num)
        if page_num == start_page:
            guba_news = get_guba_news_onePage(url)
        else:
            guba_news = pd.concat([pd.read_csv(save_file), get_guba_news_onePage(url)], axis=0)
        guba_news.to_csv(save_file, index=False)
        print(f'Page {page_num} saved.')
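The count parsing above hinges on one site convention: Eastmoney abbreviates large read/comment counts with 万 (ten thousand), e.g. '1.5万' for 15,000. Purely as an illustration, the conversion can be factored into a small standalone helper (parse_count is a hypothetical name, not part of the original code):

# Hypothetical helper (illustration only): convert a Guba count string to an int
def parse_count(text):
    if '万' in text:  # '1.5万' -> 1.5 * 10000 = 15000
        return int(float(text.replace('万', '')) * 10000)
    return int(text)

print(parse_count('1.5万'))  # 15000
print(parse_count('358'))    # 358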
Call the functions to scrape and save the data
stockName = '中国平安'
stockList={'中国平安':'601318','格力电器':'000651'}
stockID = stockList[stockName]
start_page = 1
end_page = 30
get_guba_news_batch(stockID,start_page,end_page)
print("数据获取完成并保存!")
2.2 Load the scraped data and preprocess it
Download: guba_news_601318_pages1-30.csv
save_file = './data/guba_news_601318_pages1-30.csv'
guba_news = pd.read_csv(save_file)
# If parsing fails because the scraped timestamps lack seconds, use format='%Y-%m-%d %H:%M'
guba_news['update_time'] = pd.to_datetime(guba_news['update_time'], format='%Y-%m-%d %H:%M:%S')
guba_news.head(5)
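Because each batch run re-reads and re-writes the same CSV, overlapping scrapes or pages that shift between requests can leave duplicate rows. A light, optional cleanup sketch (assuming duplicates are exact row matches) before running the analyses:

# Optional cleanup sketch: drop exact duplicate rows and sort chronologically
guba_news = guba_news.drop_duplicates()
guba_news = guba_news.sort_values('update_time').reset_index(drop=True)
print(guba_news.shape)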
3. Analysis of Guba posts
The scraped Guba data supports several kinds of analysis:
1. Hot topic analysis: ranking posts by read and comment counts gives a leaderboard of hot topics, showing where the market's attention is.
2. Sentiment analysis: sentiment-scoring the post text tracks swings in investor mood, helping investors follow changes in market sentiment.
3. Time analysis: the publish and update times of posts reveal when investors are most active, which can feed into strategies built around market swings.
4. Topic association analysis: relating post topics to one another surfaces connections between topics, helping investors make sense of the market's complexity and dynamics.
5. User behavior analysis: posting and commenting patterns characterize how market participants behave, which is useful to both researchers and investors.
3.1 Hot topic analysis
Download: stopwords.txt
def hotTopicAnalyse(df, top_num=10):
    # Sort posts by read count, descending
    guba_news = df.sort_values(by='read_num', ascending=False)
    # Take the top_num hottest posts
    top = guba_news.head(top_num)
    print('Hot topic ranking:')
    print(top)
    # Scatter plot of read count vs. comment count
    plt.figure(figsize=(8, 6))
    sns.scatterplot(x=guba_news['read_num'], y=guba_news['comment_num'])
    plt.title('Read count vs. comment count')
    plt.xlabel('Read count')
    plt.ylabel('Comment count')
    plt.show()
    # Segment all titles with jieba
    text = ' '.join(guba_news['title_content'].tolist())
    words = list(jieba.cut(text))
    # Remove stopwords; a stopword file can be loaded instead of the inline list:
    # stopwords = pd.read_csv("./data/stopwords.txt",
    #                         index_col=False,
    #                         quoting=3,  # quoting=3 is csv.QUOTE_NONE: treat quotes literally
    #                         sep="\t",
    #                         names=['stopword'],
    #                         encoding='utf-8')
    stopwords = [' ', '[', ']', ',', '。', '!', '?', '的', '了', '在', '是', '我', '你', '他', '她', '它',
                 '我们', '你们', '他们', '她们', '它们', '今天', '明天', '中国', '平安', '都', '资讯', '2023']
    words = [w for w in words if w not in stopwords and len(w) >= 2]
    # Weight each word by the total read count of the titles containing it
    # (a word segmented multiple times accumulates that sum repeatedly)
    word_count = {}
    for word in words:
        if word in word_count:
            word_count[word] += guba_news[guba_news['title_content'].str.contains(word)]['read_num'].sum()
        else:
            word_count[word] = guba_news[guba_news['title_content'].str.contains(word)]['read_num'].sum()
    # Word cloud of the weighted topic words
    wordcloud = WordCloud(width=800, height=600, background_color="white", font_path="msyh.ttc")
    wordcloud.generate_from_frequencies(word_count)
    plt.imshow(wordcloud, interpolation="bilinear")
    plt.axis("off")
    plt.show()
    return word_count
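If you downloaded stopwords.txt above, the commented-out loader can replace the inline list. A minimal sketch, assuming the file holds one stopword per line:

# Sketch: load stopwords from the downloaded file (one word per line assumed)
stopwords_df = pd.read_csv('./data/stopwords.txt', index_col=False, quoting=3,
                           sep='\t', names=['stopword'], encoding='utf-8')
stopwords = stopwords_df['stopword'].tolist()
print(len(stopwords), stopwords[:10])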
3.2 Investor sentiment analysis
# Investor sentiment analysis
def investSentimentAnalyse(df, sentiment_thres=0.5):
    titles = df['title_content'].tolist()
    # SnowNLP returns a score in [0, 1]; higher means more positive
    sentiments = []
    for title in titles:
        s = SnowNLP(title)
        sentiments.append(s.sentiments)
    # Tally the distribution around the threshold
    positive_num = len([sentiment for sentiment in sentiments if sentiment > sentiment_thres])
    negative_num = len([sentiment for sentiment in sentiments if sentiment < sentiment_thres])
    neutral_num = len([sentiment for sentiment in sentiments if sentiment == sentiment_thres])
    # Report counts and shares
    print(f'Positive posts: {positive_num}, share: {positive_num/len(sentiments):.2%}')
    print(f'Negative posts: {negative_num}, share: {negative_num/len(sentiments):.2%}')
    print(f'Neutral posts: {neutral_num}, share: {neutral_num/len(sentiments):.2%}')
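investSentimentAnalyse only prints three counts around the threshold. A quick follow-up sketch (assuming guba_news from section 2.2 is loaded) plots the full score distribution so the choice of sentiment_thres can be sanity-checked:

# Sketch: histogram of SnowNLP sentiment scores over all titles
scores = [SnowNLP(t).sentiments for t in guba_news['title_content']]
plt.figure(figsize=(8, 5))
plt.hist(scores, bins=20, edgecolor='black')
plt.axvline(0.5, color='red', linestyle='--')  # the default sentiment_thres
plt.title('Distribution of sentiment scores')
plt.xlabel('SnowNLP sentiment score')
plt.ylabel('Number of posts')
plt.show()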
3.3 Posting time analysis
# Posting time analysis
def postTimeAnalyse(df):
    # Post count per hour of day
    post_count_by_hour = df.groupby(df['update_time'].dt.hour)['title_content'].count()
    # Comment count per hour of day
    comment_count_by_hour = df.groupby(df['update_time'].dt.hour)['comment_num'].sum()
    # Post count per calendar day
    post_count_by_day = df.groupby(df['update_time'].dt.date)['title_content'].count()
    # Comment count per calendar day
    comment_count_by_day = df.groupby(df['update_time'].dt.date)['comment_num'].sum()
    # Draw the four panels
    fig, ax = plt.subplots(2, 2, figsize=(10, 6))
    post_count_by_hour.plot(ax=ax[0, 0], title='Posts by hour')
    comment_count_by_hour.plot(ax=ax[0, 1], title='Comments by hour')
    post_count_by_day.plot(ax=ax[1, 0], title='Posts by day')
    comment_count_by_day.plot(ax=ax[1, 1], title='Comments by day')
    plt.tight_layout()
    plt.show()
3.4 Hot topic association analysis
# Hot topic association analysis
def topicRelationAnalyse(df, top_num=10):
    # Segment all titles and build read-weighted word counts (same scheme as 3.1)
    text = ' '.join(df['title_content'].tolist())
    words = list(jieba.cut(text))
    stopwords = [' ', '[', ']', ',', '。', '!', '?', '的', '了', '在', '是', '我', '你', '他', '她', '它',
                 '我们', '你们', '他们', '她们', '它们', '今天', '明天', '中国', '平安', '都', '资讯', '2023']
    words = [w for w in words if w not in stopwords and len(w) >= 2]
    word_count = {}
    for word in words:
        if word in word_count:
            word_count[word] += df[df['title_content'].str.contains(word)]['read_num'].sum()
        else:
            word_count[word] = df[df['title_content'].str.contains(word)]['read_num'].sum()
    # Keep the top_num highest-weighted words
    sorted_word_count = sorted(word_count.items(), key=lambda x: x[1], reverse=True)
    top_words = [x[0] for x in sorted_word_count[:top_num]]
    print(f"Top {top_num} highest-weighted words:")
    print(top_words)
    # Space-separated segmented titles for co-occurrence counting
    df['seg_title'] = df['title_content'].apply(lambda x: ' '.join(jieba.cut(x)))
    # Build a co-occurrence graph over the top words
    # (each unordered pair is visited twice per title, so weights come out doubled)
    G = nx.Graph()
    for text in df['seg_title']:
        words = set(text.split())
        for word1 in words:
            if word1 in top_words:
                for word2 in words:
                    if word1 != word2 and word2 in top_words:
                        if G.has_edge(word1, word2):
                            G[word1][word2]['weight'] += 1
                        else:
                            G.add_edge(word1, word2, weight=1)
    # Draw the graph; edge width scales with co-occurrence weight
    plt.figure(figsize=(10, 10))
    pos = nx.spring_layout(G, k=0.5)
    nx.draw_networkx_nodes(G, pos, node_size=2000, node_color='lightblue')
    nx.draw_networkx_edges(G, pos, width=[G[u][v]['weight'] * 0.1 for u, v in G.edges()])
    nx.draw_networkx_labels(G, pos, font_size=20, font_family='Microsoft YaHei')
    plt.axis('off')
    plt.show()
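To see what the graph-building loop computes in isolation, here is a toy sketch with two made-up segmented titles; note how each unordered pair is visited twice per title, so edge weights come out doubled:

# Toy sketch of the co-occurrence counting (the titles are made up for illustration)
import networkx as nx
toy_titles = ['分红 业绩 上涨', '业绩 上涨 回购']
G = nx.Graph()
for title in toy_titles:
    words = set(title.split())
    for w1 in words:
        for w2 in words:
            if w1 != w2:
                if G.has_edge(w1, w2):
                    G[w1][w2]['weight'] += 1
                else:
                    G.add_edge(w1, w2, weight=1)
# '业绩'-'上涨' appears in both titles, so it gets the largest weight (4)
print(sorted(G.edges(data=True), key=lambda e: -e[2]['weight']))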
3.5 User behavior analysis
# User behavior analysis
def userActionAnalyse(df, top_post_user_Num=10):
    # Per-author post count, total read count, and total comment count
    author_post_count = df.groupby('author_name')['title_content'].count()
    read_count = df.groupby('author_name')['read_num'].sum()
    comment_count = df.groupby('author_name')['comment_num'].sum()
    # Merge into one table
    df1 = pd.concat([author_post_count, read_count, comment_count], axis=1)
    df1.columns = ['发帖数', '阅读数', '评论数']
    # Sort by post count; rows 1..top_post_user_Num skip the single most
    # prolific account (often an aggregator rather than a regular user)
    df1 = df1.sort_values(by='发帖数', ascending=False)
    df_topUser = df1[1:1 + top_post_user_Num]
    # Time gaps between consecutive posts of each top author
    filtered_author_df = df[df['author_name'].isin(list(df_topUser.index))]
    df_sorted = filtered_author_df.sort_values(['author_name', 'update_time'])
    df_sorted['time_interval'] = df_sorted.groupby('author_name')['update_time'].diff().dt.total_seconds()
    # Four panels: posts, reads, comments, and mean posting interval per author
    fig, ax = plt.subplots(2, 2, figsize=(10, 10))
    df_topUser['发帖数'].plot(kind='bar', ax=ax[0, 0])
    ax[0, 0].set_title('Posts per author')
    ax[0, 0].set_xlabel('Author')
    ax[0, 0].set_ylabel('Posts')
    df_topUser['阅读数'].plot(kind='bar', ax=ax[0, 1])
    ax[0, 1].set_title('Reads per author')
    ax[0, 1].set_xlabel('Author')
    ax[0, 1].set_ylabel('Reads')
    df_topUser['评论数'].plot(kind='bar', ax=ax[1, 0])
    ax[1, 0].set_title('Comments per author')
    ax[1, 0].set_xlabel('Author')
    ax[1, 0].set_ylabel('Comments')
    # Mean interval between consecutive posts per author
    author_mean_time_interval = df_sorted.groupby('author_name')['time_interval'].mean().dropna()
    author_mean_time_interval.plot(kind='bar', ax=ax[1, 1])
    ax[1, 1].set_title('Mean posting interval per author')
    ax[1, 1].set_xlabel('Author')
    ax[1, 1].set_ylabel('Interval (seconds)')
    plt.tight_layout()
    plt.show()
    return df1
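userActionAnalyse returns the full per-author table, so it can be inspected beyond the plots. A short usage sketch, assuming guba_news from section 2.2 is loaded:

# Sketch: inspect the per-author statistics returned by userActionAnalyse
author_stats = userActionAnalyse(guba_news, top_post_user_Num=10)
print(author_stats.head(10))  # most active authors by post count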
3.6 Call and run the functions from 3.1 to 3.5
Download: guba_news_601318_pages1-30.csv
if __name__ == "__main__":
    # Load the data (make sure the path matches your own data file)
    save_file = './data/guba_news_601318_pages1-30.csv'
    df = pd.read_csv(save_file)
    df['update_time'] = pd.to_datetime(df['update_time'], format='%Y-%m-%d %H:%M:%S')
    # Hot topic analysis
    hotTopicAnalyse(df)
    # Investor sentiment analysis
    investSentimentAnalyse(df)
    # Posting time analysis
    postTimeAnalyse(df)
    # Hot topic association analysis
    topicRelationAnalyse(df)
    # User behavior analysis
    userActionAnalyse(df)