How to remove the duplicate of .bib files
I have multiple chapters that need to be merged into one thesis, but their reference.bib files partially overlap. I needed a way to remove the duplicate entries when merging the .bib files.

I searched on Google but found no good results, so I wrote a script myself (with the help of GPT, of course). The full script is as follows.

def read_bib_file(file_path):
    """Read a .bib file and split its content into individual entries."""
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read().split('\n@')
    # The first chunk keeps its leading '@' (there is no '\n' before it),
    # so strip it to make all chunks uniform.
    if content and content[0].startswith('@'):
        content[0] = content[0][1:]
    return content

def remove_duplicates(bib_items):
    """Keep only the first entry seen for each citation key."""
    unique_items = {}
    for item in bib_items:
        if '{' not in item:  # skip empty chunks or stray text
            continue
        label = item.split('{', 1)[1].split(',', 1)[0].strip()
        if label not in unique_items:
            unique_items[label] = item
    return list(unique_items.values())

def write_bib_file(file_path, bib_items):
    """Write the de-duplicated entries back to a .bib file."""
    with open(file_path, 'w', encoding='utf-8') as file:
        for item in bib_items:
            file.write('@' + item + '\n')

# Main function
def main():
    input_path = '/home/jjia/data/lung_function/lung_function/scripts/reference.bib'  # replace with the path to your .bib file
    output_path = '/home/jjia/data/lung_function/lung_function/scripts/reference_clean.bib'  # replace with the output file path

    bib_items = read_bib_file(input_path)
    unique_items = remove_duplicates(bib_items)
    write_bib_file(output_path, unique_items)

    print(f"Done. The de-duplicated file has been saved to {output_path}")

if __name__ == "__main__":
    main()
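
The script de-duplicates a single .bib file, so the per-chapter bibliographies need to be concatenated into one file first. Here is a minimal sketch of that merge step; the file names chapter1.bib and chapter2.bib are placeholders, not part of the original script:

# Concatenate the per-chapter bibliographies into one file, then feed it
# to the de-duplication script above. File names here are placeholders.
chapter_files = ['chapter1.bib', 'chapter2.bib']

with open('reference.bib', 'w', encoding='utf-8') as merged:
    for path in chapter_files:
        with open(path, 'r', encoding='utf-8') as chapter:
            merged.write(chapter.read())
            merged.write('\n')  # keep entries from different files separated

Note that the script keeps the first occurrence of each citation key, so if two chapters define the same key with different fields, the version from the earlier chapter wins.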
