python 代码分块_[代码全屏查看]-python多进程分块读取文件
[1].[代碼] [Python]代碼
# -*- coding: GBK -*-
import urlparse
import datetime
import os
from multiprocessing import Process,Queue,Array,RLock
"""
多進程分塊讀取文件
"""
# Number of worker processes that main() starts.
WORKERS = 4
# Amount added to a block's start offset to get its end offset in process_found().
BLOCKSIZE = 100000000
# Size of the input file; stays 0 until getFilesize() overwrites it.
FILE_SIZE = 0
def getFilesize(file):
    """Measure the file to be read and store its size in the global FILE_SIZE.

    Args:
        file: Path of the file whose size is wanted.

    Returns:
        The size in bytes; the same value is written to the module-level
        FILE_SIZE so that existing callers that ignore the return value
        keep working.

    Raises:
        IOError/OSError: if the file cannot be opened.
    """
    global FILE_SIZE
    # Binary mode makes tell() a plain byte offset; the with-block closes the
    # handle even when seek()/tell() raises.
    with open(file, 'rb') as fstream:
        fstream.seek(0, os.SEEK_END)
        FILE_SIZE = fstream.tell()
    return FILE_SIZE
def process_found(pid, array, file, rlock, outdir='/data/download'):
    """Worker loop: claim successive blocks of ``file`` and copy their lines out.

    Each pass, while holding ``rlock``, the worker takes the largest value in
    ``array`` as its start offset and records the block's end offset
    (start + BLOCKSIZE, capped at the file size) in ``array[pid]``. It then
    copies the lines of that block into
    ``<outdir>/tmp_pid<pid>_jobs<end offset>``.

    Line-boundary rules:
      * start offset == file size: nothing is left, the worker stops.
      * start offset == 0: reading begins at the very first line.
      * start offset != 0: the line containing the start offset is read and
        discarded, because the worker that owns the previous block has
        already copied it.
      * a line is copied while its starting position is <= the end offset,
        so the line that straddles (or begins exactly on) the boundary
        belongs to this block.

    Args:
        pid: Index of this worker; also its slot in ``array``.
        array: Shared sequence with one slot per worker holding the end
            offset of the block that worker claimed last.
        file: Path of the file to read.
        rlock: Lock guarding the read-modify-write on ``array``.
        outdir: Directory that receives the per-block output files.
    """
    # FILE_SIZE is filled in by getFilesize() in the parent. A child that did
    # not inherit the parent's memory (spawn start method) still sees the
    # initial 0, so ask the filesystem in that case.
    file_size = FILE_SIZE or os.path.getsize(file)

    # Binary mode keeps seek()/tell() in exact byte offsets.
    with open(file, 'rb') as fstream:
        while True:
            with rlock:
                print('pid%s %s' % (pid, ','.join([str(v) for v in array])))
                startpossition = max(array)
                endpossition = array[pid] = min(startpossition + BLOCKSIZE,
                                                file_size)

            if startpossition >= file_size:  # end of the file
                print('pid%s end' % (pid))
                break

            fstream.seek(startpossition)
            if startpossition != 0:
                # Skip the line the previous block's owner already handled.
                fstream.readline()
            pos = ss = fstream.tell()

            out_name = 'tmp_pid' + str(pid) + '_jobs' + str(endpossition)
            with open(os.path.join(outdir, out_name), 'wb') as ostream:
                # "<=" (not "<"): a line that starts exactly on the boundary
                # is skipped by the next block, so it must be copied here.
                while pos <= endpossition:
                    line = fstream.readline()
                    if not line:
                        # EOF; without this the last block would spin forever.
                        break
                    # process the line
                    ostream.write(line)
                    pos = fstream.tell()

            print('pid:%s,startposition:%s,endposition:%s,pos:%s'
                  % (pid, ss, endpossition, pos))
def main():
    """Measure the input file, fan the read out over WORKERS processes, wait.

    Prints a timestamp before and after the run and the measured file size
    in between. All workers share one Array of end offsets guarded by one
    RLock, which are handed to process_found().
    """
    # Year/month/day order; the earlier "%Y/%d/%m" swapped day and month.
    stamp_format = "%Y/%m/%d %H:%M:%S"
    print(datetime.datetime.now().strftime(stamp_format))

    path = "/data/pds/download/scmcc_log/tmp_format_2011004.log"
    getFilesize(path)
    print(FILE_SIZE)

    rlock = RLock()
    # One slot per worker, all starting at 0. NOTE(review): 'l' is a C long,
    # which is only 32 bits wide on some platforms - confirm that offsets
    # beyond 2 GiB are not needed there.
    array = Array('l', WORKERS, lock=rlock)

    workers = [
        Process(target=process_found, args=(i, array, path, rlock))
        for i in range(WORKERS)
    ]
    for worker in workers:
        worker.start()
    for worker in workers:
        worker.join()

    print(datetime.datetime.now().strftime(stamp_format))
# Script entry point: run the job only when this file is executed directly.
if __name__ == "__main__":
    main()
總結
以上是生活随笔為你收集整理的python 代码分块_[代码全屏查看]-python多进程分块读取文件的全部內容,希望文章能夠幫你解決所遇到的問題。
- 上一篇: python案例教程黄蔚答案_Pytho
- 下一篇: python核心教程百度云_Python