# python爬取网易云音乐生成王力宏歌曲词云 — scrape NetEase Cloud Music with Python and build a word cloud from Wang Leehom's song lyrics
import requests
import sys,re,os
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import jieba
from PIL import Image
import numpy as np
from lxml import etree

# HTTP request headers used for the music.163.com calls made by this script.
headers = {
    'Referer': 'http://music.163.com',
    'Host': 'music.163.com',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'User-Agent': 'Chrome/10',
}
def get_song_lyric(headers, lyric_url):
    """Fetch the lyric of one song and strip timestamp characters from it.

    Args:
        headers: HTTP headers to send with the request.
        lyric_url: URL of the lyric endpoint for a single song.

    Returns:
        The lyric text with every digit, ':', '.', '[' and ']' removed, or
        an empty string when the response carries no usable lyric (no 'lrc'
        entry, or an 'lrc' entry whose 'lyric' is missing, None or empty).
    """
    res = requests.request('GET', lyric_url, headers=headers)
    # Decode the body once instead of re-parsing it for every lookup.
    payload = res.json()
    lrc = payload.get('lrc') if isinstance(payload, dict) else None
    lyric = lrc.get('lyric') if isinstance(lrc, dict) else None
    if not lyric:
        # Covers a missing 'lrc' entry as well as a missing/None/empty
        # 'lyric' value, which would otherwise make re.sub raise TypeError.
        return ''
    return re.sub(r'[\d:.[\]]', '', lyric)
def remove_stop_words(f):
    """Return ``f`` with every credit/stop word deleted.

    Each word is removed by plain substring replacement, one after another
    in the order listed, so matches inside longer words are removed too.

    Args:
        f: The text to clean.

    Returns:
        The text with all occurrences of the stop words removed.
    """
    stop_words = (
        '作词', '作曲', '编曲', 'Arranger', '录音', '混音', '人声', 'Vocal',
        '弦乐', 'Keyboard', '键盘', '编辑', '助理', 'Assistants', 'Mixing',
        'Editing', 'Recording', '音乐', '制作', 'Producer', '发行',
        'produced', 'and', 'distributed',
    )
    cleaned = f
    for term in stop_words:
        cleaned = cleaned.replace(term, '')
    return cleaned
def create_word_cloud(f):
    """Build, save and display a word cloud from lyric text.

    The text is passed through remove_stop_words, segmented with jieba,
    rendered by WordCloud, written to 'wanglihong_wordcloud.jpg' and then
    shown with matplotlib.

    Args:
        f: The lyric text to visualise.
    """
    print('根据词频,开始生成词云!')
    cleaned = remove_stop_words(f)

    # jieba's segments are joined with single spaces before being handed
    # to WordCloud.
    segments = jieba.cut(cleaned, cut_all=False, HMM=True)
    segmented_text = ' '.join(segments)

    cloud_options = {
        'font_path': './wc.ttf',
        'max_words': 100,
        'width': 2000,
        'height': 1200,
    }
    generator = WordCloud(**cloud_options)
    print(segmented_text)

    rendered = generator.generate(segmented_text)
    rendered.to_file('wanglihong_wordcloud.jpg')

    plt.imshow(rendered)
    plt.axis('off')
    plt.show()
def get_songs(artist_id):
    """Collect the ids and names of the songs linked from an artist page.

    Args:
        artist_id: Artist id as a string; it is appended to the artist
            page URL.

    Returns:
        A ``(song_ids, song_names)`` tuple of two lists of equal length,
        where ``song_ids[i]`` and ``song_names[i]`` come from the same link.
        Both lists are empty when the page cannot be parsed or contains no
        song links.
    """
    # NOTE(review): assumes song links look like '/song?id=<id>', which is
    # what the previous fixed slice href[9:] implied — confirm against the
    # live page.
    song_href_prefix = '/song?id='

    page_url = 'https://music.163.com/artist?id=' + artist_id
    res = requests.request('GET', page_url, headers=headers)
    html = etree.HTML(res.text)

    song_ids = []
    song_names = []
    if html is None:
        # etree.HTML yields None for an empty document; nothing to scan.
        return song_ids, song_names

    # Read the href and the text from the SAME <a> element. Two separate
    # XPath queries zipped together drift out of step as soon as one anchor
    # has no text node (or more than one).
    for anchor in html.xpath("//*[@id='hotsong-list']//a[@href]"):
        href = anchor.get('href')
        if not href.startswith(song_href_prefix):
            # Not a song link; slicing it would produce a bogus id.
            continue
        name = ''.join(anchor.itertext())
        song_ids.append(href[len(song_href_prefix):])
        song_names.append(name)
        print(href, ' ', name)
    return song_ids, song_names
# Artist whose songs are fetched and turned into the word cloud.
artist_id = '5346'
song_ids, song_names = get_songs(artist_id)

# Gather every lyric (each prefixed with a space) and join them once at the
# end rather than growing one string inside the loop.
lyric_parts = []
for song_id, song_name in zip(song_ids, song_names):
    lyric_url = 'http://music.163.com/api/song/lyric?os=pc&id=' + song_id + '&lv=-1&kv=-1&tv=-1'
    lyric = get_song_lyric(headers, lyric_url)
    lyric_parts.append(' ' + lyric)
    print(song_name)
all_word = ''.join(lyric_parts)

create_word_cloud(all_word)