# pre_clean.py

import csv
import json
import re

# 读取 JSON 元数据（题号 -> slug）
def load_metadata(json_path):
    """Build a lookup from question id to URL slug out of a JSON metadata file.

    The file is read as UTF-8 JSON and its ``stat_status_pairs`` list (treated
    as empty when absent) is scanned. Each entry can contribute up to two keys:

    * ``str(stat["frontend_question_id"])`` -> ``stat["question__title_slug"]``
      when both values are present and truthy;
    * ``"LCR <digits>"`` -> ``titleSlug`` when ``titleCn`` starts with ``LCR``
      followed by an optional space and digits, and ``titleSlug`` is truthy.

    Args:
        json_path: Path of the metadata JSON file.

    Returns:
        A dict mapping id strings to slugs; later entries overwrite earlier
        ones that share a key.
    """
    with open(json_path, 'r', encoding='utf-8') as handle:
        payload = json.load(handle)

    lcr_title = re.compile(r"LCR ?(\d+)")
    slugs_by_id = {}

    for entry in payload.get('stat_status_pairs', []):
        # Regular numbered question.
        stat = entry.get("stat")
        if stat:
            frontend_id = stat.get("frontend_question_id")
            stat_slug = stat.get("question__title_slug")
            if frontend_id and stat_slug:
                slugs_by_id[str(frontend_id)] = stat_slug

        # LCR question, identified by its Chinese title prefix.
        title_cn = entry.get("titleCn")
        title_slug = entry.get("titleSlug")
        if not (title_cn and title_slug):
            continue
        found = lcr_title.match(title_cn)
        if found:
            slugs_by_id[f"LCR {found.group(1)}"] = title_slug

    return slugs_by_id

# Parse the raw problem-list text file (main() passes raw.txt)
def parse_tree_txt(file_path):
    """Parse the raw problem list into ``(qid, title, difficulty, probability)`` tuples.

    Blank lines are ignored. The remaining lines are consumed in groups of
    three: a heading of the form ``"<id>. <title>"``, a percentage such as
    ``"45.3%"``, and a difficulty label.

    Args:
        file_path: Path of the UTF-8 text file to parse.

    Returns:
        A list of ``(qid, title, difficulty, probability)`` tuples. ``qid`` is
        ``"UNKNOWN"`` (and ``title`` the whole heading) when the heading does
        not contain ``". "``. ``probability`` is the percentage truncated
        toward zero to an ``int``.

    Raises:
        ValueError: If the number of non-blank lines is not a multiple of
            three, or a percentage line is not numeric.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        lines = [line.strip() for line in f if line.strip()]

    # Records are purely positional, so a dangling partial record means the
    # input is truncated or misaligned; fail with context instead of letting
    # the indexing below raise a bare IndexError.
    leftover = len(lines) % 3
    if leftover:
        raise ValueError(
            f"{file_path}: expected groups of 3 non-blank lines "
            f"(title, percent, difficulty), but found {len(lines)} lines; "
            f"{leftover} trailing line(s) starting at {lines[-leftover]!r} "
            f"do not form a complete record"
        )

    problems = []
    for start in range(0, len(lines), 3):
        id_and_title, percent, difficulty = lines[start:start + 3]

        # Split "<id>. <title>" on the first ". " occurrence.
        match = re.match(r"(.+?)\. (.+)", id_and_title)
        if match:
            qid, title = match.groups()
        else:
            qid, title = "UNKNOWN", id_and_title

        try:
            probability = int(float(percent.strip('%')))
        except ValueError as err:
            raise ValueError(
                f"{file_path}: invalid percentage {percent!r} "
                f"for record {id_and_title!r}"
            ) from err

        problems.append((qid.strip(), title.strip(), difficulty.strip(), probability))
    return problems

# 输出为 CSV 文件
def write_csv(output_path, problems, slug_map, count=3, company="the company"):
    """Write the parsed problems to a UTF-8 CSV file with a fixed header row.

    Args:
        output_path: Destination CSV path; the file is overwritten.
        problems: Iterable of ``(qid, title, difficulty, probability)`` tuples.
        slug_map: Dict mapping id strings (``"123"`` or ``"LCR <digits>"``) to
            URL slugs, used to build the link column.
        count: Value written to the count column of every row (default 3).
        company: Value written to the company column of every row.

    Question codes are derived from ``qid``:

    * all digits -> ``"LC-<qid>"``;
    * starts with ``"LCR"`` -> ``qid`` with spaces removed;
    * anything else -> ``qid`` unchanged, with no link.

    The link column is empty when no slug is found.
    """
    with open(output_path, 'w', encoding='utf-8', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['题号', '题目名', '难度', '次数', '公司', '概率', '链接'])

        for qid, title, difficulty, prob in problems:
            if qid.isdigit():
                qcode = f"LC-{qid}"
                slug = slug_map.get(qid, "")
            elif qid.startswith("LCR"):
                qcode = qid.replace(" ", "")
                slug = slug_map.get(qid, "")
                if not slug:
                    # Metadata keys use the canonical "LCR <digits>" form, so
                    # retry with that spelling when the raw id (e.g. "LCR001"
                    # or "LCR  001") did not match.
                    lcr = re.fullmatch(r"LCR\s*(\d+)", qid)
                    if lcr:
                        slug = slug_map.get(f"LCR {lcr.group(1)}", "")
            else:
                qcode = qid
                slug = ""

            url = f"https://leetcode.cn/problems/{slug}/" if slug else ""

            writer.writerow([
                qcode,
                title,
                difficulty.capitalize(),  # normalize to leading capital, e.g. "Easy"
                count,
                company,
                prob,
                url,
            ])

# 主函数
def main():
    """Convert ``raw.txt`` into ``raw.csv`` using the local metadata JSON.

    Loads the id-to-slug map first, then parses the problem list, writes the
    CSV and prints a confirmation line naming the output file.
    """
    source_txt, metadata_json, output_csv = (
        'raw.txt',
        'leetcode_problems_metadata.json',
        'raw.csv',
    )

    slugs = load_metadata(metadata_json)
    write_csv(output_csv, parse_tree_txt(source_txt), slugs)
    print(f'✅ 成功输出到 {output_csv}')

# Run the conversion only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()