舆情分析项目-重庆公交坠江原因
輿情分析項目
1、分析事件:重慶公交墜江原因
2、分析對象:
(1)網友評論(初級分類-分詞匹配;高級分類-自然語言識別,映射人類情感和意圖,比如:積極、消極、無奈、諷刺、建設、謾罵、理性分析、事后、和事佬等)
(2)評論者的公網IP(依據公網IP識別不同地域的網絡用戶,對本次事件的關注度)
(3)評論者的省份屬性(同上)
3、數據來源:
新浪評論:http://comment5.news.sina.com.cn/comment/skin/default.html?channel=gn&newsid=comos-hnfikve6671738&group=0
4、其他:
準備數據:(直接用:中國省份數據庫,世界國家名稱數據庫)參考本人博客
(1)中國的行政區劃數據,包括全國的省、市、縣(參考csdn、民政部官網)
(2)世界的國家數據(參考csdn)
(一)輿情分析項目之數據準備:采集評論數據
1、采集字段
三個字段:評論、IP、省份
其他字段:收到點贊數等等
2、Python實現數據采集
文件結構
?
(1)python主代碼
?01-busremark.py中
"""Collect Sina news comments for one article and store them in MySQL.

Each request asks the comment endpoint for a single comment (page_size=1),
parses the JSONP-style response and inserts every returned comment into the
``all_remarks`` table. A failing page is logged and skipped.
"""
import json
import requests
import pymysql
import time as timeimport
from mylog import Logger

logger1 = Logger(logfile='log1.log', logname="log1", logformat=1).getlog()  # custom logger object

# Name of the JavaScript variable the endpoint assigns its JSON payload to.
JSVAR = "loader_1541133929419_28637561"
JSONP_PREFIX = "var " + JSVAR + "="

URL_TEMPLATE = (
    "http://comment5.news.sina.com.cn/page/info?version=1&format=js&channel=gn"
    "&newsid=comos-hnfikve6671738&group=0&compress=0&ie=utf8&oe=utf8&page={page}"
    "&page_size=1&jsvar=" + JSVAR
)

HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
}

INSERT_SQL = "insert into all_remarks(nick, content, agree, area, ip, time, profile_img) values(%s,%s,%s,%s,%s,%s,%s)"

FIRST_PAGE = 1
LAST_PAGE = 6000        # pages 1..6000, one comment per page
PAUSE_EVERY = 50        # pause after every 50 pages ...
PAUSE_SECONDS = 2       # ... for 2 seconds
REQUEST_TIMEOUT = 10    # seconds; without a timeout a stalled request blocks forever


def _strip_jsonp_prefix(text):
    """Return *text* without the leading ``var <jsvar>=`` assignment, if present.

    ``str.lstrip(chars)`` treats its argument as a set of characters rather
    than a prefix, so the prefix is removed explicitly here.
    """
    text = text.lstrip()
    if text.startswith(JSONP_PREFIX):
        return text[len(JSONP_PREFIX):]
    return text


def fetch_comments(url):
    """Request *url* and return the list stored under result/cmntlist."""
    response = requests.get(url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
    # The URL requests utf8 output. json.loads decodes \uXXXX escapes itself;
    # running 'unicode_escape' over the body first would also unescape \" and
    # \n inside strings and make the JSON unparsable.
    data_str = _strip_jsonp_prefix(response.content.decode('utf-8'))
    data_dict = json.loads(data_str)
    print(type(data_dict))
    return data_dict['result']['cmntlist']


def save_comment(cursor, c):
    """Print the fields of comment dict *c* and insert it through *cursor*."""
    nick = c["nick"]                  # nickname
    content = c["content"]            # comment text
    agree = int(c["agree"])           # number of likes received
    area = c["area"]                  # region
    ip = c["ip"]                      # source IP
    comment_time = c["time"]          # publish time of the comment
    profile_img = c["profile_img"]    # avatar
    print(nick)
    print(content)
    print(agree)
    print(ip)
    print(comment_time)
    print(profile_img)
    cursor.execute(INSERT_SQL, (nick, content, agree, area, ip, comment_time, profile_img))


def main():
    """Walk through all pages and persist every comment that can be fetched."""
    connect = pymysql.Connect(
        host='localhost',
        port=3306,
        user='root',
        passwd='root',
        db='analyze',
        charset='utf8',
    )
    cursor = connect.cursor()
    try:
        for page_num in range(FIRST_PAGE, LAST_PAGE + 1):
            if page_num % PAUSE_EVERY == 0:
                timeimport.sleep(PAUSE_SECONDS)
            url = URL_TEMPLATE.format(page=page_num)
            try:
                all_remarks = fetch_comments(url)
                print(len(all_remarks))
                for i, c in enumerate(all_remarks, start=1):
                    print(i, "*" * 100)
                    save_comment(cursor, c)
                    connect.commit()  # make the insert permanent
            except Exception as e:
                # Deliberate best-effort: log the failing page and move on.
                logger1.warning("%s ==> %s", e, url)
                continue
    finally:
        # Release the database resources even if the run is interrupted.
        cursor.close()
        connect.close()


if __name__ == '__main__':
    main()
?
(2)python日志
?mylog.py中
# A small logging helper: records go both to the console and to a log file.
import logging
import os

# Output formats, selected by number through the ``logformat`` argument.
format_dict = {
    1: logging.Formatter('%(asctime)s - %(name)s - %(filename)s - %(levelname)s - %(message)s'),
    2: logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s'),
    3: logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s'),
    4: logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s'),
    5: logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s'),
}


class Logger():
    """Configure a named logger that writes to a file and to the console."""

    def __init__(self, logfile, logname, logformat):
        """Set up the logger called *logname*.

        Args:
            logfile: path of the file the records are written to.
            logname: name passed to ``logging.getLogger``.
            logformat: key into ``format_dict`` (converted with ``int``).

        Raises:
            KeyError: if *logformat* is not a key of ``format_dict``.

        ``logging.getLogger`` returns the same object for the same name, so
        constructing this class twice with one name used to attach a second
        pair of handlers and every record was emitted twice. Handlers are now
        only added when an equivalent one is not attached yet; handlers that
        are already attached keep their current formatter.
        """
        self.logger = logging.getLogger(logname)
        self.logger.setLevel(logging.DEBUG)

        formatter = format_dict[int(logformat)]

        # FileHandler stores the absolute path in ``baseFilename``.
        log_path = os.path.abspath(logfile)
        has_file_handler = any(
            isinstance(h, logging.FileHandler) and h.baseFilename == log_path
            for h in self.logger.handlers
        )
        if not has_file_handler:
            fh = logging.FileHandler(logfile)
            fh.setLevel(logging.DEBUG)
            fh.setFormatter(formatter)
            self.logger.addHandler(fh)

        # FileHandler subclasses StreamHandler, so exclude it explicitly.
        has_console_handler = any(
            isinstance(h, logging.StreamHandler) and not isinstance(h, logging.FileHandler)
            for h in self.logger.handlers
        )
        if not has_console_handler:
            ch = logging.StreamHandler()
            ch.setLevel(logging.DEBUG)
            ch.setFormatter(formatter)
            self.logger.addHandler(ch)

    def getlog(self):
        """Return the configured ``logging.Logger`` object."""
        return self.logger


if __name__ == '__main__':
    logger1 = Logger(logfile='log1.txt', logname="fox1", logformat=1).getlog()
    logger1.debug('i am debug')
    logger1.info('i am info')
    logger1.warning('i am warning')

    logger2 = Logger(logfile='log2.txt', logname="fox2", logformat=2).getlog()
    logger2.debug('i am debug2')
    logger2.info('i am info2')
    logger2.warning('i am warning2')
3、sql建表語句
?
/*
Navicat MySQL Data Transfer

Source Server         : win7_local
Source Server Version : 50717
Source Host           : localhost:3306
Source Database       : analyze

Target Server Type    : MYSQL
Target Server Version : 50717
File Encoding         : 65001

Date: 2018-11-06 19:33:57
*/

SET FOREIGN_KEY_CHECKS=0;

-- ----------------------------
-- Table structure for all_remarks
-- ----------------------------
DROP TABLE IF EXISTS `all_remarks`;
CREATE TABLE `all_remarks` (
  `id` int(11) unsigned NOT NULL AUTO_INCREMENT,
  `nick` varchar(255) DEFAULT NULL,
  `content` text,
  `agree` int(10) DEFAULT NULL,
  `area` varchar(100) DEFAULT NULL,
  `ip` varchar(20) DEFAULT NULL,
  -- Publish time of the comment. No ON UPDATE CURRENT_TIMESTAMP here:
  -- with that clause any later UPDATE of the row (for example filling in
  -- province_brief) would replace the publish time with the current time.
  `time` datetime DEFAULT NULL,
  `profile_img` varchar(255) DEFAULT NULL,
  `province_brief` varchar(20) DEFAULT NULL,
  PRIMARY KEY (`id`)
) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8;
?
?
4、效果截圖
?
?
中途添加了字段
?
?
# 02-mysql_to_province_country.py
"""Label every comment in ``all_remarks`` with a province or country.

The ``area`` text of each unlabeled row is searched for a province short name
from ``tb_provinces`` first, then for a country name from ``tb_countries``;
the first match is written to ``province_brief``.
"""
import pymysql

SELECT_PROVINCES_SQL = "select brief from tb_provinces"
SELECT_COUNTRIES_SQL = "select country from tb_countries"
SELECT_UNLABELED_SQL = "select id,area,province_brief from all_remarks where province_brief is null or province_brief=''"
UPDATE_LABEL_SQL = "update all_remarks set province_brief=%s where id=%s"


def _first_match(candidates, location):
    """Return the first name in *candidates* contained in *location*, else None."""
    return next((name for name in candidates if name in location), None)


def _fetch_names(cursor, sql):
    """Run *sql* and return the non-empty values of its first column as a list."""
    cursor.execute(sql)
    # Skip NULL/empty names: None cannot be searched for, and '' matches everything.
    return [row[0] for row in cursor.fetchall() if row[0]]


def main():
    """Label all rows whose ``province_brief`` is still NULL or empty."""
    connect = pymysql.connect(
        host='localhost',
        port=3306,
        user='root',
        passwd='root',
        db='analyze',
        charset='utf8',
    )
    cursor = connect.cursor()
    try:
        # Two-character short names of the Chinese provinces.
        pro_list = _fetch_names(cursor, SELECT_PROVINCES_SQL)
        print(pro_list)

        # Country names.
        countries_list = _fetch_names(cursor, SELECT_COUNTRIES_SQL)
        print(countries_list)
        print(len(countries_list))

        # Read all unlabeled rows once. Re-selecting "limit 1" on every pass
        # returns the same row again whenever that row matches nothing, so
        # one unmatched row would block every row after it.
        cursor.execute(SELECT_UNLABELED_SQL)
        pending = cursor.fetchall()

        # NOTE(review): the message literals below are kept exactly as found;
        # some look damaged by a text-conversion step - confirm the intended text.
        for row in pending:
            print(row)
            remark_id = row[0]
            location = row[1] or ""  # area may be NULL in the table

            province = _first_match(pro_list, location)
            if province is not None:
                cursor.execute(UPDATE_LABEL_SQL, (province, remark_id))
                connect.commit()
                continue

            # No province matched: fall back to the country list.
            print("id=%s,不屬于任何省份" % remark_id)
            print("開始判斷屬于哪個(gè)國(guó)家")
            country = _first_match(countries_list, location)
            if country is not None:
                cursor.execute(UPDATE_LABEL_SQL, (country, remark_id))
                connect.commit()
                print("id=%s ,屬于 %s" % (remark_id, country))
            else:
                print("位置異常,沒有匹配到任何省份和國(guó)家:%s" % location)
    finally:
        # Close cursor and connection even when a query fails.
        cursor.close()
        connect.close()


if __name__ == '__main__':
    main()
03-matplotlib_provinc_count.py
"""Draw a bar chart of comment counts per province from ``stst_count_province``."""
import pymysql
import matplotlib.pyplot as plt
import matplotlib  # full matplotlib package, needed for rcParams

# Global font settings: family (matplotlib's default is sans-serif) and size (default 10).
matplotlib.rcParams['font.family'] = 'Microsoft Yahei'
matplotlib.rcParams['font.size'] = 18

# Open the database connection and a cursor.
connect = pymysql.connect(
    host='localhost',
    port=3306,
    user='root',
    passwd='root',
    db='analyze',
    charset='utf8',
)
cursor = connect.cursor()

# Read (province_brief, count_id) pairs.
query = "select province_brief,count_id from stst_count_province"
cursor.execute(query)
rows = cursor.fetchall()

# Order the rows by count_id, largest first.
ranked = sorted(rows, key=lambda row: row[1], reverse=True)
print(ranked)

# The data is in memory now; release cursor and connection.
cursor.close()
connect.close()

# Split the ranked rows into the two series that get plotted.
province_list = [row[0] for row in ranked]
count_id_list = [row[1] for row in ranked]
print(province_list)
print(count_id_list)

# One bar per province, positioned at 0, 1, 2, ...
positions = list(range(len(province_list)))

plt.figure(figsize=(20, 10), dpi=80)
plt.bar(positions, count_id_list, width=0.5, color='r')

# x tick labels: "<rank> <province>", rank counted from 1.
tick_labels = []
for rank, name in enumerate(province_list, start=1):
    tick_labels.append(str(rank) + " " + name)
plt.xticks(positions, tick_labels, rotation=40, fontsize=12)

# Grid lines.
plt.grid()

# Title and axis captions.
plt.title("中國(guó)各個(gè)省份對(duì)《重慶公交墜江事件》關(guān)注度統(tǒng)計(jì) 數(shù)據(jù)來源:sina")
plt.xlabel("省/直轄市/特別行政區(qū)", color='b')
plt.ylabel("評(píng)論數(shù)", color='black')

plt.show()
?
最終效果:
轉載于:https://www.cnblogs.com/andy9468/p/9897391.html
總結(jié)
以上是生活随笔為你收集整理的舆情分析项目-重庆公交坠江原因的全部內容,希望文章能夠幫你解決所遇到的問題。
- 上一篇: 【贪心】小Y的炮[cannon]题解
- 下一篇: 织梦正则批量替换文章内容内链变成绝对路径