Harvest cell
In [ ]:
# Requires: pip install rapidfuzz regex
import subprocess, re, json, os
from collections import Counter, defaultdict
from rapidfuzz import fuzz

REPO = "/home/jovyan/work"  # change this to your repo root
AUTO_DIR = "transcripts/auto"
HUMAN_DIR = "transcripts/human"
RES = os.path.join(REPO, "resources")
os.makedirs(RES, exist_ok=True)
MAP_PATH = os.path.join(RES, "manual_map.json")
CONF_PATH = os.path.join(RES, "confusion.json")
DOM_PATH = os.path.join(RES, "domain_terms.txt")

def _load_json(p, default):
    if os.path.exists(p):
        with open(p, "r", encoding="utf-8") as f:
            return json.load(f)
    return default

manual_map = _load_json(MAP_PATH, {})
confusion = _load_json(CONF_PATH, {})  # dict<char, dict<char, int>>
if os.path.exists(DOM_PATH):
    with open(DOM_PATH, "r", encoding="utf-8") as f:
        domain = set(f.read().splitlines())
else:
    domain = set()
def git_diff_path(a, b):
    # --no-index compares the two paths on disk directly, which is what
    # "auto vs human" needs (neither side has to be tracked). It exits 1
    # whenever differences exist, so use run() instead of check_output(),
    # which would raise on that nonzero exit code.
    cmd = ["git", "diff", "--no-index", "--no-color", "--unified=0", "--", a, b]
    res = subprocess.run(cmd, cwd=REPO, capture_output=True)
    return res.stdout.decode("utf-8", "ignore")
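# Illustrative only (made-up content): with --unified=0 the output is bare
# hunks of the form
#   @@ -3 +3 @@
#   -今天講關稅古價協定
#   +今天講關稅估價協定
# harvest_pairs() below walks exactly this shape.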
def harvest_pairs(diff_text, min_len_word=2):
    # Collect word-level and char-level edits from a unified diff.
    word_pairs = Counter()
    char_pairs = Counter()
    a_buf, b_buf = [], []

    def flush():
        # Align whatever is buffered, then clear the buffers.
        if not (a_buf or b_buf):
            return
        a = " ".join(a_buf)
        b = " ".join(b_buf)
        # After coarse sentence-level alignment, extract words.
        a_words = re.findall(r'[\u4e00-\u9fffA-Za-z0-9]{%d,}' % min_len_word, a)
        b_words = re.findall(r'[\u4e00-\u9fffA-Za-z0-9]{%d,}' % min_len_word, b)
        # Naive pairing: for each a-word, take the unused b-word with the
        # highest similarity.
        used = set()
        for aw in a_words:
            best, score = None, 0
            for bw in b_words:
                if bw in used:
                    continue
                s = fuzz.WRatio(aw, bw)
                if s > score:
                    best, score = bw, s
            if best and score >= 90 and aw != best:
                word_pairs[(aw, best)] += 1
                used.add(best)
                # Char level: for equal-length pairs, record the positions
                # where the characters actually differ. (The word regex never
                # yields length-1 tokens, so a length==1 check would be dead.)
                if len(aw) == len(best):
                    for ca, cb in zip(aw, best):
                        if ca != cb:
                            char_pairs[(ca, cb)] += 1
        a_buf.clear()
        b_buf.clear()

    for line in diff_text.splitlines():
        if line.startswith("@@"):
            flush()  # close out the previous hunk before starting a new one
        elif line.startswith("---") or line.startswith("+++"):
            continue  # file headers, not content
        elif line.startswith("-"):
            a_buf.append(line[1:].strip())
        elif line.startswith("+"):
            b_buf.append(line[1:].strip())
        else:
            # Context line or file boundary: align the current buffers.
            flush()
    flush()  # don't drop the final hunk
    return word_pairs, char_pairs
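# A quick self-check on a synthetic hunk (illustrative; the strings are made
# up). The 12-character pair scores about 92 with WRatio, clearing the 90
# threshold, and being equal-length it also yields the char pair ('古', '估').
_demo = "@@ -1 +1 @@\n-世界貿易組織關稅古價協定\n+世界貿易組織關稅估價協定"
_wp, _cp = harvest_pairs(_demo)
# _wp == Counter({('世界貿易組織關稅古價協定', '世界貿易組織關稅估價協定'): 1})
# _cp == Counter({('古', '估'): 1})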
# Harvest from the diff between the auto and human transcripts
diff_txt = git_diff_path(AUTO_DIR, HUMAN_DIR)
w_pairs, c_pairs = harvest_pairs(diff_txt)
# Update manual_map (multi-character words)
for (wrong, right), cnt in w_pairs.items():
    if wrong != right and len(wrong) >= 2:
        manual_map[wrong] = right
# Update confusion (single-character statistics)
tmp = defaultdict(lambda: defaultdict(int))
for (a, b), cnt in c_pairs.items():
    tmp[a][b] += cnt
# Merge into the existing confusion table
for a, bs in tmp.items():
    dst = confusion.get(a, {})
    for b, c in bs.items():
        dst[b] = dst.get(b, 0) + c
    confusion[a] = dst
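# Illustrative shape after merging: confusion["古"] == {"估": 3} would mean
# "古" was corrected to "估" three times across all harvested diffs.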
# Update domain (extract terms from the final human transcripts, biased
# toward domain vocabulary)
def harvest_domain_from_dir(path):
    freq = Counter()
    for root, _, files in os.walk(os.path.join(REPO, path)):
        for f in files:
            if not f.endswith((".txt", ".md")):
                continue
            with open(os.path.join(root, f), "r", encoding="utf-8") as fh:
                txt = fh.read()
            for w in re.findall(r'[\u4e00-\u9fffA-Za-z0-9]{2,}', txt):
                # Weighting: words containing characters such as
                # 法/條/第/款/稅/關/估價/協定/價格/報關 (statute, tariff and
                # customs-valuation vocabulary) count double.
                bonus = 2 if re.search(r'(法|條|第|款|稅|關|估價|協定|價格|報關)', w) else 1
                freq[w] += bonus
    return {w for w, _ in freq.most_common(4000)}
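# Illustrative only: "關稅法" matches the bonus pattern (it contains 關, 稅
# and 法) and adds 2 per occurrence, while a generic word like "今天" adds 1,
# so statutory terms float toward the top-4000 cut.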
domain |= harvest_domain_from_dir(HUMAN_DIR)
# Write everything back to disk
with open(MAP_PATH, "w", encoding="utf-8") as f:
    json.dump(manual_map, f, ensure_ascii=False, indent=2)
with open(CONF_PATH, "w", encoding="utf-8") as f:
    json.dump(confusion, f, ensure_ascii=False, indent=2)
with open(DOM_PATH, "w", encoding="utf-8") as f:
    f.write("\n".join(sorted(domain)))
print(f"manual_map: {len(manual_map)} entries, confusion: {sum(len(v) for v in confusion.values())} pairs, domain: {len(domain)} terms")
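A minimal sketch of how the harvested manual_map might be consumed downstream (apply_manual_map is a hypothetical helper, not part of the cell above): replacing the longest keys first keeps a short entry from clobbering a longer correction that contains it.

In [ ]:
# Hypothetical downstream use of manual_map (a sketch, not the pipeline's own API)
def apply_manual_map(text, mapping):
    # Longest keys first, so an entry like "關稅古價" wins over a shorter "古價".
    for wrong in sorted(mapping, key=len, reverse=True):
        text = text.replace(wrong, mapping[wrong])
    return text

# e.g. apply_manual_map("關稅古價協定", {"古價": "估價"}) == "關稅估價協定"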