1.
这几天一直想做个网站,关于网易云的,就是爬取网易云评论来进行展示。有了想法,就立马行动起来。我的准备是用python来爬取网易云评论。然后自己写个前端,进行展示。不需要太复杂。就是一个简单的页面就行了。通过js向我的后端发请求来获取评论。后端用flask来写一个api。随机返回十几个评论。想着蛮简单的,但是,做起来就遇到了不少困难。
困难1.
我对css和JavaScript不是很熟,很多东西都不会。开始写页面的时候,就想着一个简单的页面而已,不会太难。但是当你自己真的来写的时候,还是有点困难的。这个困难怎么说呢?页面是写出来了,但是,但是,太TM丑了。虽然我也是实用主义者,但是还是看不下去。我其实想模仿着b站上一个up主做的,后面背景图片用的是一个GIF图,这样更好看一点。但是我的GIF不能覆盖整个背景,只能覆盖一半。那怎么办?没办法,只能一半一半了。害!
页面好看不是评判一个网站的好坏的根本标准,但是一个恶心人的页面一定不是好页面。
——鲁迅
困难2.
页面写好了,就要准备功能了。我准备用python开始写爬虫了。按照正常程序来,打开网易云主页,轻轻按下F12,开始找评论的请求,以我的技术,没一会就找到了。一个叫get?csrf_token=的请求。打开一看,没错里面就是我要找的评论。
一切顺利!
是个post请求,我就慢慢的往下移动,看看这个post要的数据,但是当我看到这个请求数据,我就感觉不妙,有一股淡淡的"杀气"
这么长的东西,是个什么鬼???这直接给我整emo了。
没办法,只有求助搜索引擎了。一百度,嘿,还真有,还蛮多。在大佬们的博客畅游了好久之后,总算知道了网易云的评论加密方式了。由开始的直接懵逼,到后来慢慢有点头绪。先主要介绍怎么加密的吧。
// Request-encryption helpers excerpted from the NetEase Cloud Music web client.
// Exposes two entry points on `window`: `asrsea` (function d) and `ecnonasr` (function e).
// NOTE(review): the `!function() {` opener was missing from the excerpt and is restored here
// so that the trailing `}();` balances — confirm against the site's actual script.
!function() {
    /**
     * Build a random string of `a` characters drawn from [a-zA-Z0-9].
     * It is called with 16 below, so it yields a 16-character random string
     * (a string, not an integer).
     */
    function a(a) {
        var d, e, b = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789", c = "";
        for (d = 0; a > d; d += 1) {
            e = Math.random() * b.length;
            e = Math.floor(e);
            c += b.charAt(e);
        }
        return c;
    }

    /**
     * AES-encrypt the text `a` with the key `b`.
     * Uses CBC mode with the fixed IV "0102030405060708" and returns the
     * cipher object's string form (presumably Base64 — verify against CryptoJS).
     */
    function b(a, b) {
        var c = CryptoJS.enc.Utf8.parse(b),
            d = CryptoJS.enc.Utf8.parse("0102030405060708"),
            e = CryptoJS.enc.Utf8.parse(a),
            f = CryptoJS.AES.encrypt(e, c, {
                iv: d,
                mode: CryptoJS.mode.CBC
            });
        return f.toString();
    }

    /**
     * Encrypt `a` with an RSA key pair built from `b` and `c`; used to produce encSecKey.
     * NOTE(review): `b`/`c` look like the public exponent and modulus — confirm
     * against the RSAKeyPair helper, which is defined outside this excerpt.
     */
    function c(a, b, c) {
        var d, e;
        setMaxDigits(131);
        d = new RSAKeyPair(b, "", c);
        e = encryptedString(d, a);
        return e;
    }

    /**
     * window.asrsea: AES-encrypt `d` twice — first with the key `g`, then with a
     * fresh 16-character random key `i` — and encrypt `i` itself via c(i, e, f).
     * Returns { encText, encSecKey }.
     */
    function d(d, e, f, g) {
        var h = {}, i = a(16);
        h.encText = b(d, g);
        h.encText = b(h.encText, i);
        h.encSecKey = c(i, e, f);
        return h;
    }

    /**
     * window.ecnonasr: encrypt the concatenation `a + e` via c() only.
     * Returns { encText }.
     */
    function e(a, b, d, e) {
        var f = {};
        f.encText = c(a + e, b, d);
        return f;
    }

    window.asrsea = d,
    window.ecnonasr = e
}();
// Emoji-name lookup table stored on the "nm.x.ek" namespace object (Tu9l).
// Tu9l.emj maps each emoji name to a short string fragment.
// Tu9l.md is an ordered list of emoji names; it contains every key of Tu9l.emj
// except the last six ("流泪", "强", "爱心", "女孩", "惊恐", "大笑").
(function() {var c7f = NEJ.P, eu7n = c7f("nej.g"), u7n = c7f("nej.j"), j7c = c7f("nej.u"), Tu9l = c7f("nm.x.ek");Tu9l.emj = {"色": "00e0b","流感": "509f6","这边": "259df","弱": "8642d","嘴唇": "bc356","亲": "62901","开心": "477df","呲牙": "22677","憨笑": "ec152","猫": "b5ff6","皱眉": "8ace6","幽灵": "15bb7","蛋糕": "b7251","发怒": "52b3a","大哭": "b17a8","兔子": "76aea","星星": "8a5aa","钟情": "76d2e","牵手": "41762","公鸡": "9ec4e","爱意": "e341f","禁止": "56135","狗": "fccf6","亲亲": "95280","叉": "104e0","礼物": "312ec","晕": "bda92","呆": "557c9","生病": "38701","钻石": "14af6","拜": "c9d05","怒": "c4f7f","示爱": "0c368","汗": "5b7a4","小鸡": "6bee2","痛苦": "55932","撇嘴": "575cc","惶恐": "e10b4","口罩": "24d81","吐舌": "3cfe4","心碎": "875d3","生气": "e8204","可爱": "7b97d","鬼脸": "def52","跳舞": "741d5","男孩": "46b8e","奸笑": "289dc","猪": "6935b","圈": "3ece0","便便": "462db","外星": "0a22b","圣诞": "8e7","流泪": "01000","强": "1","爱心": "0CoJU","女孩": "m6Qyw","惊恐": "8W8ju","大笑": "d"};Tu9l.md = ["色", "流感", "这边", "弱", "嘴唇", "亲", "开心", "呲牙", "憨笑", "猫", "皱眉", "幽灵", "蛋糕", "发怒", "大哭", "兔子", "星星", "钟情", "牵手", "公鸡", "爱意", "禁止", "狗", "亲亲", "叉", "礼物", "晕", "呆", "生病", "钻石", "拜", "怒", "示爱", "汗", "小鸡", "痛苦", "撇嘴", "惶恐", "口罩", "吐舌", "心碎", "生气", "可爱", "鬼脸", "跳舞", "男孩", "奸笑", "猪", "圈", "便便", "外星", "圣诞"]
}// this table supplies one of the arguments passed to window.asrsea; that argument is a constant value
)();
(function() {var c7f = NEJ.P, eu7n = c7f("nej.g"), u7n = c7f("nej.j"), j7c = c7f("nej.u"), Tu9l = c7f("nm.x.ek"), l7e = c7f("nm.x");if (u7n.be8W.redefine)return;window.GEnc = true;var bva4e = function(chr0x) {var m7f = [];j7c.bf8X(chr0x, function(chh0x) {m7f.push(Tu9l.emj[chh0x])});return m7f.join("")};var cha0x = u7n.be8W;u7n.be8W = function(Y8Q, e7d) {var i7b = {}, e7d = NEJ.X({}, e7d), mh9Y = Y8Q.indexOf("?");if (window.GEnc && /(^|\.com)\/api/.test(Y8Q) && !(e7d.headers && e7d.headers[eu7n.BJ4N] == eu7n.FN3x) && !e7d.noEnc) {if (mh9Y != -1) {i7b = j7c.hh8Z(Y8Q.substring(mh9Y + 1));Y8Q = Y8Q.substring(0, mh9Y)}if (e7d.query) {i7b = NEJ.X(i7b, j7c.fQ7J(e7d.query) ? j7c.hh8Z(e7d.query) : e7d.query)}if (e7d.data) {i7b = NEJ.X(i7b, j7c.fQ7J(e7d.data) ? j7c.hh8Z(e7d.data) : e7d.data)}i7b["csrf_token"] = u7n.gU8M("__csrf");Y8Q = Y8Q.replace("api", "weapi");e7d.method = "post";delete e7d.query;var bKf7Y = window.asrsea(JSON.stringify(i7b), bva4e(["流泪", "强"]), bva4e(Tu9l.md), bva4e(["爱心", "女孩", "惊恐", "大笑"]));//进行加密e7d.data = j7c.cs8k({params: bKf7Y.encText,encSecKey: bKf7Y.encSecKey})}
这段代码看起来肯定有点长,但是关键代码也不多,我都加了注释。我第一次看到这个也是一脸懵,怎么还有["爱心", "女孩", "惊恐", "大笑"],这都是什么鬼?后来我才知道,这就是用来迷惑你的。上面有个字典,每一个词语都对应几个字符(字母或数字)。后面是把这些字符拼接起来当作参数进行加密。经过我的测试,这个window.asrsea后面三个参数都是定值,你们可以在window.asrsea设个断点,然后在控制台输出这些值看看。你会发现,bva4e(["流泪", "强"])是'010001',bva4e(Tu9l.md)是 00e0b509f6259df8642dbc35662901477df22677ec152b5ff68ace615bb7b725152b3ab17a876aea8a5aa76d2e417629ec4ee341f56135fccf695280104e0312ecbda92557c93870114af6c9d05c4f7f0c3685b7a46bee255932575cce10b424d813cfe4875d3e82047b97ddef52741d546b8e289dc6935b3ece0462db0a22b8e7。bva4e(["爱心", "女孩", "惊恐", "大笑"])是0CoJUm6Qyw8W8jud。
那么现在就只剩下一个参数了,那就是JSON.stringify(i7b)了。你可以试着也在控制台输出一下。你可能会发现它是'{"csrf_token":""}'。这可能是断点位置的问题,换个断点你就会发现,i7b其实是下面这个。不过在你那里变量名可能不叫i7b,你自己找找。
{'csrf_token': "",'cursor': "-1",'offset': "0",'orderType': "1",'pageNo': "1",'pageSize': 20,'rid': "R_SO_4_1376142151",'threadId': "R_SO_4_1376142151"}
可以发现rid和threadId的R_SO_4_后面跟的都是歌曲id。csrf_token没啥用,就让它空着。唯一变的就是pageSize,它一直都是20的倍数。这样就好解决了。现在只要搞清楚怎么加密就行了。我们慢慢顺着往上找window.asrsea,发现window.asrsea等于一个叫d的函数。再往上找,你会发现一个叫b的函数;再往上,你又会找到叫a和c的两个函数。b的功能就是对传入的参数进行AES加密。a的作用就是随机生成一个16位的字符串(由大小写字母和数字组成)。c函数和encSecKey的生成有关。后面你会发现,对于一个固定的i,会有一个固定的encSecKey。所以我们只要固定一个i,并用它生成的encSecKey,之后就可以一直用了。那么加密过程就是:a生成一个16位的随机字符串i;d也就是window.asrsea,先用第一个参数和最后一个参数进行一次aes加密生成w,然后又以w作为参数、以a生成的16位字符串作为密钥进行第二次aes加密,生成params。这个网易云呀!!太可恶了,加密这么多,可折磨死我了。
那么下面就直接上代码吧
"""Fetch comments for one NetEase Cloud Music song through the encrypted weapi endpoint.

The request body is produced by two rounds of AES-CBC encryption (see
``get_params``) plus a pre-computed ``encSecKey`` that matches the fixed key ``i``.
"""
import json
import random
from base64 import b64encode

# Constant values observed for the last three arguments of window.asrsea.
e = "010001"
f = "00e0b509f6259df8642dbc35662901477df22677ec152b5ff68ace615bb7b725152b3ab17a876aea8a5aa76d2e417629ec4ee341f56135fccf695280104e0312ecbda92557c93870114af6c9d05c4f7f0c3685b7a46bee255932575cce10b424d813cfe4875d3e82047b97ddef52741d546b8e289dc6935b3ece0462db0a22b8e7"
g = "0CoJUm6Qyw8W8jud"
# Fixed 16-character key for the second AES round. It must never be reassigned:
# get_encSecKey() returns the value that corresponds to exactly this key.
i = "0hyFaCNAVzOIdoht"

url = 'https://music.163.com/weapi/comment/resource/comments/get?csrf_token='
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36'}


def get_encSecKey():
    """Return the pre-computed encSecKey that pairs with the fixed key ``i``."""
    return "4022359ea3110bcd034e0160c3b89e5e172fd0110a3cf765d9f366d9fd09840a1f4a4705ac43719fdb8bfeb44d3b92334733061ad10942131184a4dfba0ac9d2cf867b8b6236523c1ca5f44c0d2d82c1c2665a3137a9241c7373539c1aa8e5e9bb9d33dafc764b5d76c2ab34fc94df85e27a934c8a603fa713f2cf38c2b7bbae"


def get_params(data):
    """Encrypt ``data`` (a JSON string) twice: first with key ``g``, then with key ``i``."""
    first = enc_params(data, g)
    second = enc_params(first, i)
    return second


def to_16(data):
    """Pad ``data`` so that its UTF-8 encoding is a multiple of 16 bytes.

    Appends ``pad`` copies of ``chr(pad)`` (1 <= pad <= 16). The pad length is
    computed from the encoded byte length, not the character count, so that
    non-ASCII text is still block-aligned once it is encoded for AES.
    """
    pad = 16 - len(data.encode('utf-8')) % 16
    data += chr(pad) * pad
    return data


def enc_params(data, key):
    """AES-CBC encrypt ``data`` with ``key`` and return the result Base64-encoded as str."""
    # Imported here so the module can be imported, and the pure helpers used,
    # even where pycryptodome is not installed.
    from Crypto.Cipher import AES

    iv = "0102030405060708"
    data = to_16(data)
    # Create the cipher object.
    aes = AES.new(key=key.encode('utf-8'), IV=iv.encode('utf-8'), mode=AES.MODE_CBC)
    # Encrypt the padded plaintext.
    bs = aes.encrypt(data.encode('utf-8'))
    # Convert the ciphertext bytes to a Base64 string.
    return str(b64encode(bs), "utf-8")


if __name__ == '__main__':
    # Only the script entry point talks to the network.
    import requests

    page = int(input('请输入需要爬取的页数:'))
    print('开始爬虫!!!')
    for j in range(1, page + 1):
        page_num = str(j * 20)
        data = {
            'csrf_token': "",
            'cursor': "-1",
            'offset': "0",
            'orderType': "1",
            'pageNo': "1",
            'pageSize': page_num,
            'rid': "R_SO_4_1376142151",
            'threadId': "R_SO_4_1376142151",
        }
        response = requests.post(
            url,
            data={"params": get_params(json.dumps(data)), "encSecKey": get_encSecKey()},
            headers=headers,
        )
        response.raise_for_status()
        result = json.loads(response.content.decode('utf-8'))
        comments = result['data']['comments']
        if not comments:
            continue
        # Pick one comment at random. random.choice stays within the list bounds
        # and, unlike the previous `i = random.randint(0, 20)`, does not clobber
        # the module-level AES key `i`.
        print(random.choice(comments)['content'])
    print('爬取完毕!!!')
困难3.
那就是用JavaScript发送请求一直不成功,百度后说是跨域(请求域,CORS)的问题,有点麻烦。等下次有时间把这个解决了吧。
今天就到这里吧!