【17-3】Twitter Sentiment Classification in Practice


139 - Twitter Sentiment Classification with a Basic Model

143 - LSTM Text Classification Model

【Reference notebook】17-3Twitter评论情绪分类.ipynb

【Exported code】

# %% [markdown]
# # 139 - Twitter Sentiment Classification

# %% [markdown]
# ## Reading and preprocessing the data

# %%
import torch
import torchtext
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import pandas as pd

# %%
data = pd.read_csv('Tweets.csv')

# %%
data.head()

# %% [markdown]
# Keep just two columns: the sentiment label and the tweet text

# %%
data = data[['airline_sentiment', 'text']]

# %%
data

# %% [markdown]
# Inspect the unique sentiment values

# %%
data.airline_sentiment.unique()

# %%
data.info()     # check for missing values

# %%
data.duplicated().sum()     # count duplicate rows

# %%
data.drop_duplicates(inplace=True)      # drop the duplicate rows

# %%
data.airline_sentiment.value_counts()       # sample count per class

# %% [markdown]
# Encode the sentiments as 0, 1, 2

# %%
label = pd.factorize(data.airline_sentiment)[0]
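# %% [markdown]
# A quick sanity check (hypothetical cell, not in the original notebook): `pd.factorize` returns a
# `(codes, uniques)` tuple, so the `[0]` above keeps the integer code per row, while the second
# element shows which sentiment string each code stands for.

# %%
codes, uniques = pd.factorize(data.airline_sentiment)
print(uniques)       # the three sentiment strings, in the order they were assigned codes
print(codes[:5])     # integer label (0/1/2) for the first five tweets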
# %% [markdown]
# Simplify the text: (1) lowercase everything, (2) strip special characters

# %%
data.text

# %%
import re
pat = re.compile('[A-Za-z]+')       # matches runs of English letters only

# %%
# text-cleaning function
def pre_text(text):
    text = pat.findall(text)            # extract all the English words
    text = [w.lower() for w in text]    # lowercase them
    return text

# %%
x = data.text.apply(pre_text)  # apply the function to every tweet

# %%
x
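# %% [markdown]
# A quick check of the cleaner on a made-up tweet (hypothetical input, not from the dataset):

# %%
pre_text('@united Thanks for the GREAT flight!!!')
# -> ['united', 'thanks', 'for', 'the', 'great', 'flight']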
# %% [markdown]
# ## Building the vocabulary
# vocab: assign every word a unique index

# %%
word_set = set()
for t in x:     # x: series of token lists, t: the token list of one tweet
    for word in t:
        word_set.add(word)

# %%
word_set        # all unique words

# %%
max_word = len(word_set) + 1    # +1 keeps index 0 free for padding/unknown words

# %%
word_list = list(word_set)      # convert the set to a list

# %%
word_list.index('you')          # example: position of 'you' in the list

# %%
word_index = dict((w, i + 1) for i, w in enumerate(word_list))     # word -> index map, starting at 1

# %%
word_index

# %%
x = x.apply(lambda t: [word_index.get(w, 0) for w in t])        # replace each word w in tweet t by its index (0 if unseen)

# %%
x

# %%
max_len = max(len(t) for t in x)

# %%
max_len

# %%
pad_x = [t + (max_len - len(t)) * [0] for t in x]      # right-pad every tweet with 0s up to max_len

# %%
pad_x = np.array(pad_x)     # convert the list of lists to an array

# %%
pad_x.shape     # 14452 tweets, each padded to length 34
# %%
label.shape

# %% [markdown]
# ## Splitting into training and test sets

# %% [markdown]
# pip install scikit-learn (the machine-learning toolkit)

# %%
from sklearn.model_selection import train_test_split

# %%
x_train, x_test, y_train, y_test = train_test_split(pad_x, label)

# %%
x_train.shape, x_test.shape
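# %% [markdown]
# The classes are skewed (value_counts above shows many more negative tweets than positive ones),
# so an optional variant (not in the original notebook) is a stratified split that keeps the class
# ratios identical in both partitions:

# %%
# hypothetical variant: stratify=label preserves the 0/1/2 proportions in train and test
x_train, x_test, y_train, y_test = train_test_split(
    pad_x, label, test_size=0.25, stratify=label, random_state=42
)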
# %% [markdown]
# Create the Dataset class

# %%
class Mydataset(torch.utils.data.Dataset):
    def __init__(self, text_array, label_array):
        self.text_array = text_array
        self.label_array = label_array

    def __getitem__(self, index):
        text = torch.LongTensor(self.text_array[index])
        label = self.label_array[index]
        return text, label

    def __len__(self):
        return len(self.label_array)

# %%
train_ds = Mydataset(x_train, y_train)
test_ds = Mydataset(x_test, y_test)

# %%
BATCH_SIZE = 32

# %%
train_dl = torch.utils.data.DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True)
test_dl = torch.utils.data.DataLoader(test_ds, batch_size=BATCH_SIZE, shuffle=False)
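# %% [markdown]
# A quick shape check (hypothetical cell, not in the original notebook): one batch from the loader
# should yield inputs of shape (BATCH_SIZE, max_len) plus a matching label vector.

# %%
xb, yb = next(iter(train_dl))
print(xb.shape, yb.shape)    # expected: torch.Size([32, 34]) torch.Size([32])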
# %% [markdown]
# ## Basic text classification model

# %%
embedding_dim = 100

# %%
# This model ignores word order entirely: it treats each tweet as a flat bag of embeddings.
class Basic_Net(nn.Module):
    def __init__(self):
        super(Basic_Net, self).__init__()
        # embedding layer: maps every token index to an embedding_dim-dimensional vector
        self.embedding = nn.Embedding(num_embeddings=max_word, embedding_dim=embedding_dim)
        # hidden linear layer
        self.fc1 = nn.Linear(max_len * embedding_dim, 1024)
        # output layer
        self.fc2 = nn.Linear(1024, 3)

    def forward(self, x):
        x = self.embedding(x)
        x = x.view(x.size(0), -1)    # flatten (max_len, embedding_dim) into a single vector per sample
        x = F.relu(self.fc1(x))
        x = self.fc2(x)              # output layer: no activation, CrossEntropyLoss expects raw logits
        return x

# %%
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = Basic_Net().to(device)

# %%
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

# %%
def fit(epoch, model, train_dl, test_dl):
    correct = 0
    total = 0
    running_loss = 0.0
    model.train()
    for x, y in train_dl:
        x, y = x.to(device), y.to(device)
        y_pred = model(x)
        loss = loss_fn(y_pred, y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        with torch.no_grad():
            y_pred = torch.argmax(y_pred, dim=1)
            correct += (y_pred == y).sum().item()
            total += y.size(0)
            running_loss += loss.item()
    epoch_loss = running_loss / len(train_dl.dataset)
    epoch_acc = correct / total

    test_correct = 0
    test_total = 0
    test_running_loss = 0.0
    model.eval()
    with torch.no_grad():
        for x, y in test_dl:
            x, y = x.to(device), y.to(device)
            y_pred = model(x)
            loss = loss_fn(y_pred, y)
            y_pred = torch.argmax(y_pred, dim=1)
            test_correct += (y_pred == y).sum().item()
            test_total += y.size(0)
            test_running_loss += loss.item()
    test_loss = test_running_loss / len(test_dl.dataset)
    test_acc = test_correct / test_total

    print(f"Epoch {epoch+1} loss: {epoch_loss:.4f} acc: {epoch_acc:.4f} | "
          f"Test loss: {test_loss:.4f} acc: {test_acc:.4f}")
    return epoch_loss, epoch_acc, test_loss, test_acc
# %%
epochs = 10

# %%
train_loss = []
train_acc = []
test_loss = []
test_acc = []

for epoch in range(epochs):
    epoch_loss, epoch_acc, epoch_test_loss, epoch_test_acc = fit(epoch, model, train_dl, test_dl)
    train_loss.append(epoch_loss)
    train_acc.append(epoch_acc)
    test_loss.append(epoch_test_loss)
    test_acc.append(epoch_test_acc)

# %% [markdown]
# Severe overfitting: training accuracy keeps climbing while test accuracy stalls.
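# %% [markdown]
# One common mitigation, sketched below as a hypothetical variant of Basic_Net (not part of the
# original notebook): add a Dropout layer between the hidden and output layers so the network
# cannot simply memorize individual training tweets. Weight decay or a smaller hidden layer would
# work along the same lines.

# %%
class Basic_Net_Dropout(nn.Module):
    def __init__(self, drop_p=0.5):
        super(Basic_Net_Dropout, self).__init__()
        self.embedding = nn.Embedding(max_word, embedding_dim)
        self.fc1 = nn.Linear(max_len * embedding_dim, 1024)
        self.drop = nn.Dropout(drop_p)       # randomly zeroes activations, active in train() mode only
        self.fc2 = nn.Linear(1024, 3)

    def forward(self, x):
        x = self.embedding(x)
        x = x.view(x.size(0), -1)
        x = F.relu(self.fc1(x))
        x = self.drop(x)
        x = self.fc2(x)
        return x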
# %% [markdown]
# # 143 - LSTM Text Classification Model

# %%
embedding_dim = 100
hidden_size = 200

# %%
class LSTM_Net(nn.Module):
    def __init__(self, max_word, embedding_dim):
        super(LSTM_Net, self).__init__()
        self.embedding = nn.Embedding(max_word, embedding_dim)      # output: batch * max_len * embedding_dim
        self.lstm = nn.LSTM(embedding_dim, hidden_size, batch_first=True)
        self.fc1 = nn.Linear(hidden_size, 256)
        self.fc2 = nn.Linear(256, 3)

    def forward(self, x):
        x = self.embedding(x)
        x, _ = self.lstm(x)     # x --> (batch, time_step, hidden_size)
        x = x[:, -1, :]         # keep only the last time step
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x
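# %% [markdown]
# Caveat: the tweets were right-padded with zeros, so the last time step often sits on padding
# tokens rather than real words. A hypothetical refinement (not in the original notebook) is to
# pack the sequences so the LSTM stops at each tweet's true length; a minimal sketch, assuming
# index 0 is used only for padding:

# %%
class LSTM_Net_Packed(nn.Module):
    def __init__(self, max_word, embedding_dim):
        super(LSTM_Net_Packed, self).__init__()
        self.embedding = nn.Embedding(max_word, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_size, batch_first=True)
        self.fc1 = nn.Linear(hidden_size, 256)
        self.fc2 = nn.Linear(256, 3)

    def forward(self, x):
        lengths = (x != 0).sum(dim=1).clamp(min=1).cpu()    # true token count per tweet
        emb = self.embedding(x)
        packed = nn.utils.rnn.pack_padded_sequence(
            emb, lengths, batch_first=True, enforce_sorted=False)
        _, (h_n, _) = self.lstm(packed)      # h_n: hidden state at each sequence's real last step
        x = F.relu(self.fc1(h_n[-1]))
        return self.fc2(x)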
# %%
model = LSTM_Net(max_word, embedding_dim).to(device)

# %%
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

# %% [markdown]
# Train with the same fit function defined above.

# %%
epochs = 10
train_loss = []
train_acc = []
test_loss = []
test_acc = []

for epoch in range(epochs):
    epoch_loss, epoch_acc, epoch_test_loss, epoch_test_acc = fit(epoch, model, train_dl, test_dl)
    train_loss.append(epoch_loss)
    train_acc.append(epoch_acc)
    test_loss.append(epoch_test_loss)
    test_acc.append(epoch_test_acc)

# %%
import matplotlib.pyplot as plt

# %%
plt.plot(range(epochs), train_acc, c='r', label='Training Accuracy')
plt.plot(range(epochs), test_acc, c='b', label='Test Accuracy')
plt.title('Training and Test Accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()
plt.show()
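# %% [markdown]
# A companion plot (hypothetical cell, following the same pattern) for the loss curves, which
# shows the train/test gap from the loss side as well:

# %%
plt.plot(range(epochs), train_loss, c='r', label='Training Loss')
plt.plot(range(epochs), test_loss, c='b', label='Test Loss')
plt.title('Training and Test Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()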

