Harvest cell
In [ ]:
# Requires: pip install rapidfuzz regex
import subprocess, re, json, os
from collections import Counter, defaultdict
from rapidfuzz import fuzz

REPO = "/home/jovyan/work"  # change this to your repo root
AUTO_DIR = "transcripts/auto"
HUMAN_DIR = "transcripts/human"
RES = os.path.join(REPO, "resources")
os.makedirs(RES, exist_ok=True)
MAP_PATH = os.path.join(RES, "manual_map.json")
CONF_PATH = os.path.join(RES, "confusion.json")
DOM_PATH = os.path.join(RES, "domain_terms.txt")

def _load_json(p, default):
    if os.path.exists(p):
        with open(p, "r", encoding="utf-8") as f:
            return json.load(f)
    return default

manual_map = _load_json(MAP_PATH, {})
confusion = _load_json(CONF_PATH, {})  # dict<char, dict<char, int>>
if os.path.exists(DOM_PATH):
    with open(DOM_PATH, "r", encoding="utf-8") as f:
        domain = set(f.read().splitlines())
else:
    domain = set()
def git_diff_path(a, b):
    # --no-index compares the two paths on disk directly, which is what
    # "auto vs human" needs (neither side has to be tracked). It exits 1
    # whenever differences exist, so use run() instead of check_output(),
    # which would raise on that nonzero exit code.
    cmd = ["git", "diff", "--no-index", "--no-color", "--unified=0", "--", a, b]
    res = subprocess.run(cmd, cwd=REPO, capture_output=True)
    return res.stdout.decode("utf-8", "ignore")
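# Illustrative only (made-up content): with --unified=0 the output is bare
# hunks of the form
#   @@ -3 +3 @@
#   -今天講關稅古價協定
#   +今天講關稅估價協定
# harvest_pairs() below walks exactly this shape.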
def harvest_pairs(diff_text, min_len_word=2):
    # Collect word-level and char-level edits from a unified diff.
    word_pairs = Counter()
    char_pairs = Counter()
    a_buf, b_buf = [], []

    def flush():
        # Align whatever is buffered, then clear the buffers.
        if not (a_buf or b_buf):
            return
        a = " ".join(a_buf)
        b = " ".join(b_buf)
        # After coarse sentence-level alignment, extract words.
        a_words = re.findall(r'[\u4e00-\u9fffA-Za-z0-9]{%d,}' % min_len_word, a)
        b_words = re.findall(r'[\u4e00-\u9fffA-Za-z0-9]{%d,}' % min_len_word, b)
        # Naive pairing: for each a-word, take the unused b-word with the
        # highest similarity.
        used = set()
        for aw in a_words:
            best, score = None, 0
            for bw in b_words:
                if bw in used:
                    continue
                s = fuzz.WRatio(aw, bw)
                if s > score:
                    best, score = bw, s
            if best and score >= 90 and aw != best:
                word_pairs[(aw, best)] += 1
                used.add(best)
                # Char level: for equal-length pairs, record the positions
                # where the characters actually differ. (The word regex never
                # yields length-1 tokens, so a length==1 check would be dead.)
                if len(aw) == len(best):
                    for ca, cb in zip(aw, best):
                        if ca != cb:
                            char_pairs[(ca, cb)] += 1
        a_buf.clear()
        b_buf.clear()

    for line in diff_text.splitlines():
        if line.startswith("@@"):
            flush()  # close out the previous hunk before starting a new one
        elif line.startswith("---") or line.startswith("+++"):
            continue  # file headers, not content
        elif line.startswith("-"):
            a_buf.append(line[1:].strip())
        elif line.startswith("+"):
            b_buf.append(line[1:].strip())
        else:
            # Context line or file boundary: align the current buffers.
            flush()
    flush()  # don't drop the final hunk
    return word_pairs, char_pairs
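# A quick self-check on a synthetic hunk (illustrative; the strings are made
# up). The 12-character pair scores about 92 with WRatio, clearing the 90
# threshold, and being equal-length it also yields the char pair ('古', '估').
_demo = "@@ -1 +1 @@\n-世界貿易組織關稅古價協定\n+世界貿易組織關稅估價協定"
_wp, _cp = harvest_pairs(_demo)
# _wp == Counter({('世界貿易組織關稅古價協定', '世界貿易組織關稅估價協定'): 1})
# _cp == Counter({('古', '估'): 1})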
# Harvest from the diff between the auto and human transcripts
diff_txt = git_diff_path(AUTO_DIR, HUMAN_DIR)
w_pairs, c_pairs = harvest_pairs(diff_txt)
# Update manual_map (multi-character words)
for (wrong, right), cnt in w_pairs.items():
    if wrong != right and len(wrong) >= 2:
        manual_map[wrong] = right
# Update confusion (single-character statistics)
tmp = defaultdict(lambda: defaultdict(int))
for (a, b), cnt in c_pairs.items():
    tmp[a][b] += cnt
# Merge into the existing confusion table
for a, bs in tmp.items():
    dst = confusion.get(a, {})
    for b, c in bs.items():
        dst[b] = dst.get(b, 0) + c
    confusion[a] = dst
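# Illustrative shape after merging: confusion["古"] == {"估": 3} would mean
# "古" was corrected to "估" three times across all harvested diffs.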
# Update domain (extract terms from the final human transcripts, biased
# toward domain vocabulary)
def harvest_domain_from_dir(path):
    freq = Counter()
    for root, _, files in os.walk(os.path.join(REPO, path)):
        for f in files:
            if not f.endswith((".txt", ".md")):
                continue
            with open(os.path.join(root, f), "r", encoding="utf-8") as fh:
                txt = fh.read()
            for w in re.findall(r'[\u4e00-\u9fffA-Za-z0-9]{2,}', txt):
                # Weighting: words containing characters such as
                # 法/條/第/款/稅/關/估價/協定/價格/報關 (statute, tariff and
                # customs-valuation vocabulary) count double.
                bonus = 2 if re.search(r'(法|條|第|款|稅|關|估價|協定|價格|報關)', w) else 1
                freq[w] += bonus
    return {w for w, _ in freq.most_common(4000)}
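# Illustrative only: "關稅法" matches the bonus pattern (it contains 關, 稅
# and 法) and adds 2 per occurrence, while a generic word like "今天" adds 1,
# so statutory terms float toward the top-4000 cut.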
domain |= harvest_domain_from_dir(HUMAN_DIR)
# Write everything back to disk
with open(MAP_PATH, "w", encoding="utf-8") as f:
    json.dump(manual_map, f, ensure_ascii=False, indent=2)
with open(CONF_PATH, "w", encoding="utf-8") as f:
    json.dump(confusion, f, ensure_ascii=False, indent=2)
with open(DOM_PATH, "w", encoding="utf-8") as f:
    f.write("\n".join(sorted(domain)))
print(f"manual_map: {len(manual_map)} entries, confusion: {sum(len(v) for v in confusion.values())} pairs, domain: {len(domain)} terms")
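A minimal sketch of how the harvested manual_map might be consumed downstream (apply_manual_map is a hypothetical helper, not part of the cell above): replacing the longest keys first keeps a short entry from clobbering a longer correction that contains it.

In [ ]:
# Hypothetical downstream use of manual_map (a sketch, not the pipeline's own API)
def apply_manual_map(text, mapping):
    # Longest keys first, so an entry like "關稅古價" wins over a shorter "古價".
    for wrong in sorted(mapping, key=len, reverse=True):
        text = text.replace(wrong, mapping[wrong])
    return text

# e.g. apply_manual_map("關稅古價協定", {"古價": "估價"}) == "關稅估價協定"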