AI智能
改变未来

python检测文件去重复

#!/usr/bin/env python
#coding: utf-8

import md5
import os
from time import clock as now


def getmd5(filename):
    file_txt = open(filename, 'rb').read()
    m = md5.new(file_txt)
    return m.hexdigest()


def main():
    path = u'/home/'
    all_md5 = {}
    all_size = {}
    total_file = 0
    total_delete = 0
    start = now()
    for file in os.listdir(path):
        total_file += 1
        real_path = os.path.join(path, file)
        if os.path.isfile(real_path) == True:
            size = os.stat(real_path).st_size
            name_and_md5 = [real_path, '']
            if size in all_size.keys():
                new_md5 = getmd5(real_path)
                if all_size[size][1] == '':
                    all_size[size][1] = getmd5(all_size[size][0])
                if new_md5 in all_size[size]:
                    total_delete += 1
                    print u'删除', file
                    try:
                        os.remove(os.path.join(path, file))
                    except:
                        print 'No such file: %s' % file
                else:
                    all_size[size].append(new_md5)
            else:
                all_size[size] = name_and_md5
    end = now()
    time_last = end - start
    print u'文件总数: ', total_file
    print u'删除个数: ', total_delete
    print u'耗时: ', time_last, '秒'


if __name__ == '__main__':
    main()
赞(1) 打赏
未经允许不得转载:爱站程序员基地 » python检测文件去重复