#!/usr/bin/env python
#coding: utf-8
import hashlib
import md5
import os
from time import clock as now
def getmd5(filename, chunk_size=65536):
    """Return the hexadecimal MD5 digest of a file's contents.

    The file is opened in binary mode and hashed in fixed-size chunks, so
    memory use stays bounded regardless of file size, and the handle is
    closed as soon as hashing finishes (or fails).

    Args:
        filename: Path of the file to hash.
        chunk_size: Number of bytes read per iteration.

    Returns:
        The 32-character lowercase hex digest string.

    Raises:
        IOError/OSError: If the file cannot be opened or read.
    """
    digest = hashlib.md5()
    with open(filename, 'rb') as handle:
        # iter(callable, sentinel) keeps reading until read() returns b''
        # (end of file).
        for chunk in iter(lambda: handle.read(chunk_size), b''):
            digest.update(chunk)
    return digest.hexdigest()
def main(path=u'/home/'):
    """Delete duplicate regular files located directly inside ``path``.

    Files are grouped by size first; MD5 digests are only computed once a
    second file with the same size shows up. A file whose digest matches one
    already recorded for its size is removed, so the first file encountered
    with a given content is the one that is kept. Subdirectories are not
    descended into.

    Prints one line per file it tries to delete, followed by a summary:
    number of regular files examined, number of files actually removed, and
    the elapsed time measured with the module-level ``now`` timer.

    Args:
        path: Directory to scan. Defaults to ``u'/home/'``, the location
            that used to be hard-coded.
    """
    # size -> (path of the first file seen with that size, set of digests).
    # The digest set stays empty until a second file of the same size forces
    # the first file to be hashed.
    seen_by_size = {}
    total_file = 0
    total_delete = 0
    start = now()

    for name in os.listdir(path):
        real_path = os.path.join(path, name)
        if not os.path.isfile(real_path):
            continue
        # Only regular files are counted, so the reported total matches its
        # "number of files" label.
        total_file += 1

        size = os.stat(real_path).st_size
        if size not in seen_by_size:
            seen_by_size[size] = (real_path, set())
            continue

        first_path, digests = seen_by_size[size]
        if not digests:
            # Lazily hash the first file of this size now that it has a
            # potential duplicate.
            digests.add(getmd5(first_path))

        new_md5 = getmd5(real_path)
        if new_md5 not in digests:
            digests.add(new_md5)
            continue

        print(u'删除 %s' % name)
        try:
            os.remove(real_path)
        except OSError:
            # Best effort: report and carry on. The message text is kept
            # as-is for output compatibility even though the failure may
            # have another cause (e.g. permissions).
            print('No such file: %s' % name)
        else:
            # Count a deletion only when the file was really removed.
            total_delete += 1

    time_last = now() - start
    # The double space after each colon reproduces the original output, where
    # the print statement inserted a separator after the trailing space.
    print(u'文件总数:  %s' % total_file)
    print(u'删除个数:  %s' % total_delete)
    print(u'耗时:  %s 秒' % time_last)
# Run the duplicate scan only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
# Python检测文件去重复
# 未经允许不得转载:爱站程序员基地 » Python检测文件去重复