I am a Python novice. I wrote a program to calculate file MD5 checksums, and it runs fine. I got confused when I converted it to multithreading: on a two-core machine, the dual-threaded version takes the same time as the single-threaded one, and I cannot tell what causes this. The test environment has 44 files of 200 MB–400 MB each (12.1 GB in total), and both versions run in about 42 seconds.
I then tried multiple processes as well, but the processing time still did not improve.
Code without multithreading:
#!/usr/bin/python3
import os, hashlib, binascii, pymysql, time, json, datetime
def listFiles(dir):
    """Return the full path of every file found under *dir*, recursively."""
    return [
        os.path.join(root, name)
        for root, _subdirs, names in os.walk(dir)
        for name in names
    ]
def calcMD5(filePath, block_size=2**20):
    """Return the hex MD5 digest of the file at *filePath*.

    Reads the file in *block_size* chunks (default 1 MiB) so memory use
    stays flat regardless of file size.
    """
    md5 = hashlib.md5()
    # `with` guarantees the descriptor is closed even if read() raises;
    # the original open()/close() pair leaked the handle on any I/O error.
    with open(filePath, "rb") as f:
        while True:
            data = f.read(block_size)
            if not data:
                break
            md5.update(data)
    return md5.hexdigest()
# Sequential driver: hash every file under /data/S01 and time the run.
files = listFiles("/data/S01")
result = []
startTime = datetime.datetime.now()
for path in files:
    result.append(calcMD5(path))
print(result)
endTime = datetime.datetime.now()
# total_seconds() is correct even if the run spans more than a day;
# timedelta.seconds alone silently drops the days component.
elapsed = int((endTime - startTime).total_seconds())
# "m:ss" with a separator -- the original "{0}{1}" printed e.g. "042",
# which is ambiguous (0 min 42 s and 4 min 2 s can look alike).
print("{0}:{1:02d}".format(elapsed // 60, elapsed % 60))
Multithreaded code:
#!/usr/bin/python3
import os, hashlib, binascii, pymysql, time, json, datetime, threading, queue
def listFiles(dir):
    """Walk *dir* and collect the paths of all regular files beneath it."""
    collected = []
    for root, _dirs, filenames in os.walk(dir):
        collected.extend(os.path.join(root, fn) for fn in filenames)
    return collected
class threadMD5(threading.Thread):
    """Worker thread: pulls file paths from a queue and MD5-hashes them.

    NOTE(review): hashing 200-400 MB files is dominated by disk reads, so
    two threads sharing one disk are unlikely to beat the sequential run --
    confirm by comparing disk vs. CPU utilisation during a run.
    """

    def __init__(self, queue, results=None):
        # `results` is optional so existing callers keep working. The
        # original version computed each digest and then threw it away,
        # which is why the script's `result` list always printed empty.
        threading.Thread.__init__(self)
        self.queue = queue
        self.results = results

    def run(self):
        while True:
            try:
                # Non-blocking get: raises when the queue is drained.
                filePath = self.queue.get(block=False)
            except Exception:
                # In practice this is queue.Empty, but the script rebinds
                # the name `queue` to the Queue instance before run()
                # executes, so the exception class is not reliably
                # reachable as `queue.Empty` here; keep the broad catch.
                print("thread end")
                break
            fileMD5 = calcMD5(filePath)
            if self.results is not None:
                # list.append is atomic under the GIL, so no extra lock
                # is needed for this shared list.
                self.results.append((filePath, fileMD5))
            self.queue.task_done()
def calcMD5(filePath, block_size=2**20):
    """Return the hex MD5 digest of *filePath*, streamed in chunks.

    *block_size* (default 1 MiB) bounds memory use for large files.
    """
    digest = hashlib.md5()
    # `with` closes the handle even on a read error; the original
    # open()/close() pair leaked the descriptor in that case.
    with open(filePath, "rb") as fh:
        # iter() with a b"" sentinel yields chunks until EOF.
        for chunk in iter(lambda: fh.read(block_size), b""):
            digest.update(chunk)
    return digest.hexdigest()
# Threaded driver. NOTE(review): wall time matches the sequential version,
# presumably because the workload is reading ~12 GB from one disk -- two
# threads contend for the same I/O bandwidth; confirm with disk/CPU stats.
startTime = datetime.datetime.now()
files = listFiles("/data/S01")
result = []
# Fill the work queue up front; the workers drain it until empty.
# NOTE(review): this rebinds the name `queue` from the module to a Queue
# instance, so `queue.Empty` is no longer reachable after this line.
queue = queue.Queue()
for i in files:
    queue.put(i, block=False)
threads = []
for i in range(2):
    t = threadMD5(queue)
    # Daemon threads would not keep the process alive, but join() below
    # waits for them explicitly anyway.
    t.setDaemon(True)
    t.start()
    threads.append(t)
for i in threads:
    i.join()
# NOTE(review): `result` is never written to -- threadMD5.run() discards
# each digest -- so this always prints an empty list.
print(result)
endTime = datetime.datetime.now()
timeDiff = endTime - startTime
# timedelta.seconds drops any whole days; fine for short runs.
timeDiffSeconds = timeDiff.seconds
# Prints minutes and seconds with no separator (e.g. "042" for 42 s).
print("{0}{1}".format(int(timeDiffSeconds/60), int(timeDiffSeconds%60)))
Multiprocessing code:
import os, hashlib, time, datetime
import multiprocessing as mp
# Digests delivered by collect_results() as pool tasks finish.
results = []
def listFiles(dir):
    """Recursively gather every file path below *dir*."""
    found = []
    for top, _ignored, names in os.walk(dir):
        found += [os.path.join(top, entry) for entry in names]
    return found
def calcMD5(filePath, block_size=2**20):
    """Return the hex MD5 digest of the file at *filePath*.

    Streams the file in *block_size* chunks (default 1 MiB) to keep
    memory use constant for arbitrarily large files.
    """
    md5 = hashlib.md5()
    # Context manager closes the file even if read() raises; the
    # original leaked the open handle on any I/O error.
    with open(filePath, "rb") as f:
        while True:
            data = f.read(block_size)
            if not data:
                break
            md5.update(data)
    return md5.hexdigest()
def collect_results(result):
    """Pool callback: store one finished digest in the shared list.

    apply_async() hands the callback the worker's return value -- a single
    hexdigest string -- so it must be append()ed. The original used
    extend(), which spreads the string into the list one character at a
    time.
    """
    results.append(result)
if __name__ == "__main__":
    # Two worker processes. NOTE(review): for this disk-bound workload the
    # workers mostly contend for the same I/O bandwidth, so little speedup
    # is expected -- confirm by watching disk vs. CPU utilisation.
    p = mp.Pool(processes=2)
    files = listFiles("/data/S01")
    startTime = datetime.datetime.now()
    for f in files:
        # Each task returns one hexdigest; collect_results stores it.
        p.apply_async(calcMD5, args=(f, ), callback=collect_results)
    p.close()  # no more tasks will be submitted
    p.join()   # wait for all workers to drain the task list
    print(results)
    endTime = datetime.datetime.now()
    # total_seconds() also counts whole days; timedelta.seconds alone
    # would wrap for runs longer than a day.
    elapsed = int((endTime - startTime).total_seconds())
    # "m:ss" with a separator -- the original "{0}{1}" printed ambiguous
    # strings like "042".
    print("{0}:{1:02d}".format(elapsed // 60, elapsed % 60))