netflix数据处理2(转)
生活随笔
收集整理的這篇文章主要介紹了
netflix数据处理2(转)
小編覺得挺不錯的,現在分享給大家,幫大家做個參考.
原始數據:
$head -10 mv_0006890.txt
6890:
1735266,1,2004-04-02
1008399,1,2004-06-22
2360117,2,2003-11-08
1294425,2,2004-03-15
439931,4,2004-03-27
1583311,1,2004-03-11
2431832,3,2005-02-13
620771,2,2004-03-20
1110906,1,2004-03-04
結果數據:user_id movie_id rating
$head -10 ratings_0.txt
499040 9419 3
2071637 9419 4
896780 9419 3
2625420 9419 2
652121 9419 3
1003291 9419 4
818736 9419 3
332152 9419 2
2174771 9419 4
47411 9419 5
import sys
import os
import re
# When True, main() rolls over to a new ratings_<n>.txt output file once the
# current one grows past the size threshold; when False, everything is
# written to ratings_0.txt.
CHUNK_FILES = True
def mkdir(path):
    """Create directory *path* (including missing parents) if it is absent.

    Does nothing when *path* already exists. Any other failure from
    ``os.makedirs`` (for example a permission error) is propagated as
    ``OSError``.
    """
    try:
        os.makedirs(path)
    except OSError:
        # Attempt first, check afterwards: this avoids the window between
        # an existence check and the creation in which another process
        # could create the directory. A path that exists is still treated
        # as success, exactly as before.
        if not os.path.exists(path):
            raise
def _reformat_movie_titles(src='movie_titles.txt',
                           dst='reformatted_movie_titles.txt'):
    """Rewrite the title catalogue *src* as ``id<TAB>title`` lines in *dst*.

    Each input line is expected to look like ``<id>,<year>,<title>``; the
    middle field is dropped. Blank lines are skipped, and lines that do not
    match the pattern are reported on stderr and skipped instead of
    aborting the run with an AttributeError.
    """
    title_pattern = re.compile(r"([\w]+),([\w]+),(.*)")
    with open(src, 'r') as titles, open(dst, 'w') as out:
        for line in titles:
            stripped = line.strip()
            if not stripped:
                continue
            m = title_pattern.match(stripped)
            if m is None:
                sys.stderr.write(
                    'skipping malformed title line: %r\n' % line)
                continue
            out.write('%s\t%s\n' % (m.group(1), m.group(3)))


def _convert_ratings(in_dir, out_dir):
    """Flatten every per-movie file in *in_dir* into ``ratings_<n>.txt``.

    The first line of each input file is read as ``<movie_id>:``; every
    following line as ``<user_id>,<rating>,<date>``. One
    ``user_id movie_id rating`` line is written per rating. When
    ``CHUNK_FILES`` is true a new output file is started once the current
    one exceeds the size threshold.

    Raises ValueError if a non-blank rating line does not have exactly
    three comma-separated fields.
    """
    def ratings_path(num):
        return out_dir + '/' + 'ratings_' + str(num) + '.txt'

    # os.listdir() returns entries in arbitrary order; sorting makes the
    # contents of each output chunk reproducible between runs.
    movie_files = [in_dir + '/' + name for name in sorted(os.listdir(in_dir))]

    written = 0  # characters written to the current output file
    outfile_num = 0
    mkdir(out_dir)
    output_file = open(ratings_path(outfile_num), 'w')
    try:
        for i, moviefile in enumerate(movie_files):
            # Parenthesised so the statement parses under Python 2 and 3.
            print("processing movie %s " % (i + 1))
            with open(moviefile, 'r') as f:
                movieid = None
                for j, line in enumerate(f):
                    if j == 0:
                        # Header line: "<movie_id>:".
                        movieid = line.split(':')[0].strip()
                        continue
                    if not line.strip():
                        # Tolerate blank lines (e.g. a trailing newline).
                        continue
                    userid, rating, _date = line.split(',')
                    nextline = ' '.join([userid, movieid, rating + '\n'])
                    written += len(nextline)
                    # Roll over once more than 65536 thousand characters
                    # would be in the current file. Floor division keeps
                    # the threshold identical on Python 2 and 3.
                    if CHUNK_FILES and written // 1000 > 65536:
                        output_file.close()
                        outfile_num += 1
                        outfilename = ratings_path(outfile_num)
                        print("--- starting new file: %s" % outfilename)
                        output_file = open(outfilename, 'w')
                        written = len(nextline)
                    output_file.write(nextline)
    finally:
        # Close the current chunk even if a malformed line aborts the run.
        output_file.close()


def main(args):
    """Reformat the movie title list, then convert the rating files.

    Args:
        args: argv-style sequence; ``args[1]`` is the input directory
            holding the per-movie rating files and ``args[2]`` is the
            output directory (created if missing).

    Reads ``movie_titles.txt`` from the current working directory and
    writes ``reformatted_movie_titles.txt`` next to it before the rating
    files are processed.

    Raises:
        IndexError: if fewer than two directory arguments are supplied.
    """
    _reformat_movie_titles()

    in_dir = args[1]   # input directory
    out_dir = args[2]  # output directory
    _convert_ratings(in_dir, out_dir)

if __name__ == '__main__':
    # Script entry point: hand the raw argv to main(), which reads
    # args[1] (input directory) and args[2] (output directory).
    main(sys.argv)
經過處理,得到多個用戶評分數據集,合并到一個文件
#!/bin/bash
# Append every ratings chunk produced by the Python script to result.txt.
# "$x" is quoted so paths containing spaces or glob characters stay intact.
# The trailing '&' runs the loop in the background: wait for it to finish
# before reading result.txt.
for x in netflix-data/ratings_*.txt; do
    cat "$x" >> result.txt
done &
$head -10 result.txt
499040 9419 3
2071637 9419 4
896780 9419 3
2625420 9419 2
652121 9419 3
1003291 9419 4
818736 9419 3
332152 9419 2
2174771 9419 4
47411 9419 5
$head -10 mv_0006890.txt
6890:
1735266,1,2004-04-02
1008399,1,2004-06-22
2360117,2,2003-11-08
1294425,2,2004-03-15
439931,4,2004-03-27
1583311,1,2004-03-11
2431832,3,2005-02-13
620771,2,2004-03-20
1110906,1,2004-03-04
結果數據:user_id movie_id rating
$head -10 ratings_0.txt
499040 9419 3
2071637 9419 4
896780 9419 3
2625420 9419 2
652121 9419 3
1003291 9419 4
818736 9419 3
332152 9419 2
2174771 9419 4
47411 9419 5
import sys
import os
import re
# When True, main() rolls over to a new ratings_<n>.txt output file once the
# current one grows past the size threshold; when False, everything is
# written to ratings_0.txt.
CHUNK_FILES = True
def mkdir(path):
    """Create directory *path* (including missing parents) if it is absent.

    Does nothing when *path* already exists. Any other failure from
    ``os.makedirs`` (for example a permission error) is propagated as
    ``OSError``.
    """
    try:
        os.makedirs(path)
    except OSError:
        # Attempt first, check afterwards: this avoids the window between
        # an existence check and the creation in which another process
        # could create the directory. A path that exists is still treated
        # as success, exactly as before.
        if not os.path.exists(path):
            raise
def _reformat_movie_titles(src='movie_titles.txt',
                           dst='reformatted_movie_titles.txt'):
    """Rewrite the title catalogue *src* as ``id<TAB>title`` lines in *dst*.

    Each input line is expected to look like ``<id>,<year>,<title>``; the
    middle field is dropped. Blank lines are skipped, and lines that do not
    match the pattern are reported on stderr and skipped instead of
    aborting the run with an AttributeError.
    """
    title_pattern = re.compile(r"([\w]+),([\w]+),(.*)")
    with open(src, 'r') as titles, open(dst, 'w') as out:
        for line in titles:
            stripped = line.strip()
            if not stripped:
                continue
            m = title_pattern.match(stripped)
            if m is None:
                sys.stderr.write(
                    'skipping malformed title line: %r\n' % line)
                continue
            out.write('%s\t%s\n' % (m.group(1), m.group(3)))


def _convert_ratings(in_dir, out_dir):
    """Flatten every per-movie file in *in_dir* into ``ratings_<n>.txt``.

    The first line of each input file is read as ``<movie_id>:``; every
    following line as ``<user_id>,<rating>,<date>``. One
    ``user_id movie_id rating`` line is written per rating. When
    ``CHUNK_FILES`` is true a new output file is started once the current
    one exceeds the size threshold.

    Raises ValueError if a non-blank rating line does not have exactly
    three comma-separated fields.
    """
    def ratings_path(num):
        return out_dir + '/' + 'ratings_' + str(num) + '.txt'

    # os.listdir() returns entries in arbitrary order; sorting makes the
    # contents of each output chunk reproducible between runs.
    movie_files = [in_dir + '/' + name for name in sorted(os.listdir(in_dir))]

    written = 0  # characters written to the current output file
    outfile_num = 0
    mkdir(out_dir)
    output_file = open(ratings_path(outfile_num), 'w')
    try:
        for i, moviefile in enumerate(movie_files):
            # Parenthesised so the statement parses under Python 2 and 3.
            print("processing movie %s " % (i + 1))
            with open(moviefile, 'r') as f:
                movieid = None
                for j, line in enumerate(f):
                    if j == 0:
                        # Header line: "<movie_id>:".
                        movieid = line.split(':')[0].strip()
                        continue
                    if not line.strip():
                        # Tolerate blank lines (e.g. a trailing newline).
                        continue
                    userid, rating, _date = line.split(',')
                    nextline = ' '.join([userid, movieid, rating + '\n'])
                    written += len(nextline)
                    # Roll over once more than 65536 thousand characters
                    # would be in the current file. Floor division keeps
                    # the threshold identical on Python 2 and 3.
                    if CHUNK_FILES and written // 1000 > 65536:
                        output_file.close()
                        outfile_num += 1
                        outfilename = ratings_path(outfile_num)
                        print("--- starting new file: %s" % outfilename)
                        output_file = open(outfilename, 'w')
                        written = len(nextline)
                    output_file.write(nextline)
    finally:
        # Close the current chunk even if a malformed line aborts the run.
        output_file.close()


def main(args):
    """Reformat the movie title list, then convert the rating files.

    Args:
        args: argv-style sequence; ``args[1]`` is the input directory
            holding the per-movie rating files and ``args[2]`` is the
            output directory (created if missing).

    Reads ``movie_titles.txt`` from the current working directory and
    writes ``reformatted_movie_titles.txt`` next to it before the rating
    files are processed.

    Raises:
        IndexError: if fewer than two directory arguments are supplied.
    """
    _reformat_movie_titles()

    in_dir = args[1]   # input directory
    out_dir = args[2]  # output directory
    _convert_ratings(in_dir, out_dir)

if __name__ == '__main__':
    # Script entry point: hand the raw argv to main(), which reads
    # args[1] (input directory) and args[2] (output directory).
    main(sys.argv)
經過處理,得到多個用戶評分數據集,合并到一個文件
#!/bin/bash
# Append every ratings chunk produced by the Python script to result.txt.
# "$x" is quoted so paths containing spaces or glob characters stay intact.
# The trailing '&' runs the loop in the background: wait for it to finish
# before reading result.txt.
for x in netflix-data/ratings_*.txt; do
    cat "$x" >> result.txt
done &
$head -10 result.txt
499040 9419 3
2071637 9419 4
896780 9419 3
2625420 9419 2
652121 9419 3
1003291 9419 4
818736 9419 3
332152 9419 2
2174771 9419 4
47411 9419 5
轉載于:https://www.cnblogs.com/qq78292959/archive/2011/05/31/2076601.html
總結
以上是生活随笔為你收集整理的netflix数据处理2(转)的全部內容,希望文章能夠幫你解決所遇到的問題。
- 上一篇: 开源GIS---.Net系列
- 下一篇: CodeSmith注册机,支持5.2.2