A blush-inducing Python data analysis, hehehe (Part 4)


Word Cloud
path = 'C:/Users/zbd/Desktop/Amazon/fenci/'

# Read the file and split it into words
def get_text():
    f = open(path + 'reviews.txt', 'r', encoding='utf-8')
    text = f.read().lower()                       # convert everything to lower case
    for i in '!@#$%^&*()_ˉ+-;:`~\'"<>=./?,':      # strip punctuation
        text = text.replace(i, '')
    return text.split()                           # return the list of tokens

lst_1 = get_text()                                # tokenise
print('There are {} words in total'.format(len(lst_1)))

# Remove stop words (very common words)
stop_word_text = open(path + 'stop_word.txt', 'r', encoding='utf-8')   # downloaded stop-word list
stop_word = stop_word_text.read().split()
# extra stop words can be appended here; the garbled entries are mojibake tokens found in the scraped reviews
stop_word_add = ['a', 'i', 'im', 'it鈥檚', 'i鈥檓', '\\u0026', '5鈥', 'reviewdate']
stop_word_new = stop_word + stop_word_add
#print(stop_word_new)
lst_2 = list(word for word in lst_1 if word not in stop_word_new)
print('After removing stop words, {} words remain'.format(len(lst_2)))

# Count word frequencies
counts = {}
for i in lst_2:
    counts[i] = counts.get(i, 0) + 1
#print(counts)
word_counts = list(counts.items())
#print(word_counts)
word_counts.sort(key=lambda x: x[1], reverse=True)   # sort by frequency, descending

# Print the 50 most frequent words
for i in word_counts[0:50]:
    print(i)

# Build the word cloud
import numpy as np
import matplotlib.pyplot as plt
import jieba                                 # only needed for Chinese tokenising
from PIL import Image                        # scipy.misc.imread was removed from SciPy, so load the mask with Pillow
from wordcloud import WordCloud, ImageColorGenerator

stopwords = {}
# isCN = 0                                   # 0: English tokenising, 1: Chinese tokenising
path = 'C:/Users/zbd/Desktop/Amazon/fenci/'
back_coloring_path = path + 'img.jpg'            # background (mask) image
text_path = path + 'reviews.txt'                 # text to analyse
stopwords_path = path + 'stop_word.txt'          # stop-word list
imgname1 = path + 'WordCloudDefautColors.png'    # output 1: mask shape only, default colours
imgname2 = path + 'WordCloudColorsByImg.png'     # output 2: colours taken from the mask image
#font_path = r'./fonts\simkai.ttf'               # font path for matplotlib, mainly needed for Chinese text

back_coloring = np.array(Image.open(back_coloring_path))   # mask image as a 3-D array

wc = WordCloud(
    #font_path = font_path,            # set the font
    background_color='white',          # background colour
    max_words=3000,                    # maximum number of words shown
    mask=back_coloring,                # mask image
    max_font_size=200,                 # largest font size
    min_font_size=5,                   # smallest font size
    random_state=42,                   # fixes one of N colour layouts
    width=1000, height=860, margin=2   # default canvas size; with a mask the saved image follows the mask size,
                                       # and margin is the padding around each word
)

#wc.generate(text)
words = {}
for i in word_counts:
    words['{}'.format(i[0])] = i[1]
wc.generate_from_frequencies(words)    # expects a mapping like {word1: freq1, word2: freq2, ..., wordn: freqn}

plt.figure()
# Word cloud shaped by the mask, drawn with the default colours
plt.imshow(wc)
plt.axis("off")
plt.show()
wc.to_file(imgname1)                   # save the image

# Word cloud shaped by the mask, recoloured to match the mask image
image_colors = ImageColorGenerator(back_coloring)   # derive colours from the mask image
plt.imshow(wc.recolor(color_func=image_colors))
plt.axis("off")
plt.show()
wc.to_file(imgname2)

# Show the original mask image
plt.figure()
plt.imshow(back_coloring, cmap=plt.cm.gray)
plt.axis("off")
plt.show()
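A side note on the frequency counting above: the hand-rolled dictionary loop can also be written with collections.Counter from the standard library. The snippet below is a minimal sketch, not part of the original script; the token list is a placeholder standing in for lst_2.

from collections import Counter

tokens = ['great', 'battery', 'great', 'quiet', 'battery', 'great']   # placeholder for lst_2 above

counts = Counter(tokens)                    # mapping of word -> frequency
for word, freq in counts.most_common(3):    # most_common(n) returns the n most frequent pairs, already sorted
    print(word, freq)

# Counter is a dict subclass, so it could be passed straight to
# wc.generate_from_frequencies(counts) instead of building `words` by hand.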
Data Analysis
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.colors

get_ipython().magic('matplotlib inline')   # Jupyter magic; only works when run inside IPython/Jupyter

# 0. Read the data
item_info = pd.read_csv('C:/Users/zbd/Desktop/Amazon/item_info.csv', engine='python')
reviews_new = pd.read_csv('C:/Users/zbd/Desktop/Amazon/reviews_new.csv', engine='python')
print(item_info.head())
print(len(item_info))
#print(reviews_new.head())

# 1. Clean the data
# Keep only the columns needed (.copy() avoids SettingWithCopyWarning when new columns are added below)
item_info_c = item_info[['Rank', 'item_name', 'store', 'price', 'Date_first_listed_on_Amazon',
                         'star', 'reviews', 'Read reviews that mention']].copy()

# Clean the price column
item_info_c['price'] = item_info_c['price'].str.replace('$', '', regex=False)   # regex=False so the literal '$' is stripped
item_info_c['min_price'] = item_info_c['price'].str.split('-').str[0].astype('float')
item_info_c['max_price'] = item_info_c['price'].str.split('-').str[-1].astype('float')
item_info_c['mean_price'] = (item_info_c['max_price'] + item_info_c['min_price']) / 2

# Fill NaN values with the column mean
def f_na(data, cols):
    for i in cols:
        data[i].fillna(data[i].mean(), inplace=True)
    return data

item_info_c = f_na(item_info_c, ['star', 'reviews', 'min_price', 'max_price', 'mean_price'])
item_info_c.head(5)

# 2. Aggregate by store
a = item_info_c.groupby('store')['star'].mean().sort_values(ascending=False)        # mean star rating per store
b = item_info_c.groupby('store')['reviews'].agg(reviews_sum='sum', reviews_mean='mean')   # total and mean review count (named aggregation; the old dict-renaming form was removed in newer pandas)
c = item_info_c.groupby('store')['min_price'].mean()    # mean minimum price per store
d = item_info_c.groupby('store')['max_price'].mean()    # mean maximum price per store
e = item_info_c.groupby('store')['mean_price'].mean()   # mean price per store
e.name = 'price_mean'
f = item_info_c.groupby('store')['star'].count()        # number of items per store
f.name = 'item_num'
#print(a,b,c,d,e,f)

df = pd.concat([a, b, e, f], axis=1)

# Share of items per store (out of 100 items in total)
df['per'] = df['item_num'] / 100
df['per%'] = df['per'].apply(lambda x: '%.2f%%' % (x * 100))

# Min-max standardisation: scale each metric to 0-10 and write it to a new '<col>_nor' column
def data_nor(df, *cols):
    for col in cols:
        colname = col + '_nor'
        df[colname] = (df[col] - df[col].min()) / (df[col].max() - df[col].min()) * 10
    return df

df_re = data_nor(df, 'star', 'reviews_mean', 'price_mean', 'item_num')
print(df_re.head(5))

# 3. Plot the charts
fig, axes = plt.subplots(4, 1, figsize=(10, 15))
plt.subplots_adjust(wspace=0, hspace=0.5)

# Star-rating ranking by store
df_star = df['star'].sort_values(ascending=False)
df_star.plot(kind='bar', color='yellow', grid=True, alpha=0.5, ax=axes[0], width=0.7,
             ylim=[3, 5], title='Star-rating ranking by store')
axes[0].axhline(df_star.mean(), label='mean star rating %.2f' % df_star.mean(), color='r', linestyle='--')
axes[0].legend(loc=1)

# Mean review count ranking by store
df_reviews_mean = df['reviews_mean'].sort_values(ascending=False)
df_reviews_mean.plot(kind='bar', color='blue', grid=True, alpha=0.5, ax=axes[1], width=0.7,
                     title='Mean review count ranking by store')
axes[1].axhline(df_reviews_mean.mean(), label='mean review count %i' % df_reviews_mean.mean(), color='r', linestyle='--')
axes[1].legend(loc=1)

# Price range by store (sorted by mean price)
avg_price = (d - c) / 2
avg_price.name = 'avg_price'
max_price = avg_price.copy()
max_price.name = 'max_price'
df_price = pd.concat([c, avg_price, max_price, df_re['price_mean']], axis=1)
df_price = df_price.sort_values(['price_mean'], ascending=False)
df_price.drop(['price_mean'], axis=1, inplace=True)
df_price.plot(kind='bar', grid=True, alpha=0.5, ax=axes[2], width=0.7, stacked=True,
              color=['white', 'red', 'blue'], ylim=[0, 55], title='Price range by store')

# Weighted-score ranking by store
df_nor = pd.concat([df_re['star_nor'], df_re['reviews_mean_nor'], df_re['price_mean_nor'], df_re['item_num_nor']], axis=1)
df_nor['nor_total'] = df_re['star_nor'] + df_re['reviews_mean_nor'] + df_re['price_mean_nor'] + df_re['item_num_nor']
df_nor = df_nor.sort_values(['nor_total'], ascending=False)
df_nor.drop(['nor_total'], axis=1, inplace=True)
df_nor.plot(kind='bar', grid=True, alpha=0.5, ax=axes[3], width=0.7, stacked=True,
            title='Weighted-score ranking by store')

# Pie chart of item counts per store
colors = ['aliceblue', 'antiquewhite', 'beige', 'bisque', 'blanchedalmond', 'blue', 'blueviolet', 'brown',
          'burlywood', 'cadetblue', 'chartreuse', 'chocolate', 'coral', 'cornflowerblue', 'cornsilk', 'crimson',
          'cyan', 'darkblue', 'darkcyan', 'darkgoldenrod', 'darkgreen', 'darkkhaki', 'darkviolet', 'deeppink',
          'deepskyblue', 'dimgray', 'dodgerblue', 'firebrick', 'floralwhite', 'forestgreen', 'gainsboro',
          'ghostwhite', 'gold', 'goldenrod']
df_per = df_re['item_num']
fig, axes = plt.subplots(1, 1, figsize=(8, 8))
plt.axis('equal')   # equal aspect ratio keeps the pie circular
plt.pie(df_per, labels=df_per.index, autopct='%.2f%%', pctdistance=1.05,
        #shadow = True,
        startangle=0, radius=1.5, colors=colors, frame=False)

# Star vs. price scatter plot by store
plt.figure(figsize=(13, 8))
x = df_re['price_mean']          # x axis: mean price
y = df_re['star']                # y axis: star rating
s = df_re['item_num'] * 100      # marker size: number of items (more items, bigger marker)
c = df_re['reviews_mean'] * 10   # marker colour: mean review count (more reviews, darker red)
plt.scatter(x, y, marker='.', cmap='Reds', alpha=0.8, s=s, c=c)
plt.grid()
plt.title('Star vs. price by store')
plt.xlim([0, 50])
plt.ylim([3, 5])
plt.xlabel('price')
plt.ylabel('star')

# Mean lines and legend
p_mean = df_re['price_mean'].mean()
s_mean = df_re['star'].mean()
plt.axvline(p_mean, label='mean price %.2f$' % p_mean, color='r', linestyle='--')
plt.axhline(s_mean, label='mean star rating %.2f' % s_mean, color='g', linestyle='-.')
plt.axvspan(p_mean, 50, ymin=(s_mean - 3) / (5 - 3), ymax=1, alpha=0.1, color='g')
plt.axhspan(0, s_mean, xmin=0, xmax=p_mean / 50, alpha=0.1, color='grey')
plt.legend(loc=2)

# Label each point with the store name
for x, y, name in zip(df_re['price_mean'], df_re['star'], df_re.index):
    plt.annotate(name, xy=(x, y), xytext=(0, -5), textcoords='offset points',
                 ha='center', va='top', fontsize=9)

# Clean the 'Read reviews that mention' column
df_rrtm = item_info_c['Read reviews that mention'].fillna('missing', inplace=False)
df_rrtm = df_rrtm.str.strip('[')
df_rrtm = df_rrtm.str.rstrip(']')
df_rrtm = df_rrtm.str.replace('\'', '')

reviews_labels = []
for i in df_rrtm:
    reviews_labels = reviews_labels + i.split(',')
#print(reviews_labels)

labels = []
for j in reviews_labels:
    if j != 'missing':
        labels.append(j)
#print(labels)

# Count label frequencies
counts = {}
for i in labels:
    counts[i] = counts.get(i, 0) + 1
#print(counts)
label_counts = list(counts.items())
#print(label_counts)
label_counts.sort(key=lambda x: x[1], reverse=True)   # sort by frequency, descending

print('There are %i review labels in total; the top 20 are:' % len(label_counts))
print('-----------------------------')
for i in label_counts[:20]:
    print(i)
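The weighted-score ranking above hinges on the min-max scaling done in data_nor: each metric is mapped to (x - min) / (max - min) * 10, so star rating, mean review count, mean price and item count all land on a 0-10 scale before being summed. Here is a minimal sketch of that step on a toy DataFrame; the store names and values are made up for illustration.

import pandas as pd

toy = pd.DataFrame({'star': [3.5, 4.0, 5.0],
                    'reviews_mean': [10.0, 200.0, 50.0]},
                   index=['store_a', 'store_b', 'store_c'])   # hypothetical stores

for col in ['star', 'reviews_mean']:
    # same formula as data_nor: scale the column to the 0-10 range
    toy[col + '_nor'] = (toy[col] - toy[col].min()) / (toy[col].max() - toy[col].min()) * 10

toy['nor_total'] = toy['star_nor'] + toy['reviews_mean_nor']   # unweighted sum, as in the script above
print(toy.sort_values('nor_total', ascending=False))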