# -*- coding: utf-8 -*-
from pathlib import Path
import csv, json, hashlib, shutil, re, subprocess, datetime, os, sys
from collections import Counter
import fitz
from openpyxl import Workbook, load_workbook
from openpyxl.styles import Alignment, Font, PatternFill
from openpyxl.utils import get_column_letter

PROJECT = Path('/Users/bot1/Volumes/root_for_ai/AI工作区/通用_法务合同台账_合同存档_20260607_1036')
DELIV = PROJECT/'deliverables'
DOCS = PROJECT/'docs'
SOURCE = PROJECT/'source'
SOURCE_PACKAGES = PROJECT/'source_packages/20260615_万物有灵合同资料'
EXTRACT_DIR = PROJECT/'work/incoming_archives/20260615_万物有灵合同资料_extracted'
OCR_DIR = PROJECT/'work/extracted_text_ocr'
TEXT_DIR = PROJECT/'work/extracted_text'
PAGE_DIR = PROJECT/'work/page_images'
ZIP_PATH = SOURCE_PACKAGES/'万物有灵合同资料.zip'
INVENTORY_PATH = DOCS/'20260615_万物有灵合同资料_zip_inventory.json'
MANIFEST_PATH = DOCS/'source_files_manifest.json'
OCR_SUMMARY_PATH = DOCS/'ocr_summary.json'
BASE_CSV = DELIV/'合同总表_20260607.csv'
BASE_XLSX = DELIV/'合同总表_20260607.xlsx'
MD_PATH = DELIV/'合同总表_20260607.md'
CATALOG_CSV = DELIV/'可发送文件目录_20260607.csv'
CATALOG_MD = DELIV/'可发送文件目录_20260607.md'
NOTE_PATH = DELIV/'新增法务文档记录_20260615_万物有灵合同资料包.md'
DUP_MD = DELIV/'重复文件清单_20260615_万物有灵合同资料包.md'
README = PROJECT/'README.md'
TODAY = '2026-06-15'
NOW = datetime.datetime.now().isoformat(timespec='seconds')
VISION = Path('/tmp/vision_ocr')

LEDGER_COLS = ['归档编号','签约/生效时间','合同名称','合同类型','项目/IP','甲方','乙方','其他方/合作方','合约有效期/授权期限','授权/合作范围','金额/费用/分成','争议解决','风险/待核对','原始文件','OCR文本','SHA256','页数/OCR字符数']
SRC_COLS = ['归档编号','展示文件名','原始来源路径','存档路径','文本/OCR路径','SHA256','文件大小','页数','文本字符数','添加时间','文件类型','来源压缩包','重复状态']
CAT_COLS = ['归档编号','文件类别','可检索关键词','建议调用场景','展示文件名','存档文件','合同名称','甲方','乙方','有效期/授权期','SHA256','状态/外发提醒']

def sha256(path: Path) -> str:
    h=hashlib.sha256()
    with path.open('rb') as f:
        for b in iter(lambda:f.read(1024*1024), b''):
            h.update(b)
    return h.hexdigest()

def safe_name(name: str) -> str:
    name = name.replace('\x00','').replace('/','_').replace('\\','_').strip()
    name = re.sub(r'\s+', ' ', name)
    name = re.sub(r'[\r\n\t:]+', '_', name)
    return name[:150]

def strip_prefix(stem: str) -> str:
    return re.sub(r'^(?:LEGAL-\d{8}-\d+_|\d{1,3}_)+', '', stem).strip('_ ')

def rel_or_abs(path):
    if not path: return ''
    p=Path(path)
    try:
        return str(p.relative_to(PROJECT))
    except Exception:
        return str(path)

def read_csv(path: Path):
    if not path.exists(): return []
    with path.open('r', encoding='utf-8-sig', newline='') as f:
        return [dict(r) for r in csv.DictReader(f)]

def write_csv(path: Path, rows, cols):
    path.parent.mkdir(parents=True, exist_ok=True)
    with path.open('w', encoding='utf-8-sig', newline='') as f:
        w=csv.DictWriter(f, fieldnames=cols, extrasaction='ignore')
        w.writeheader()
        for r in rows:
            w.writerow({c:r.get(c,'') for c in cols})

def md_table(path: Path, title: str, rows, cols, note=''):
    lines=[f'# {title}\n']
    if note: lines.append(f'> {note}\n')
    lines.append('| ' + ' | '.join(cols) + ' |')
    lines.append('| ' + ' | '.join(['---']*len(cols)) + ' |')
    def esc(x): return ('' if x is None else str(x)).replace('|','｜').replace('\n','<br>')
    for r in rows:
        lines.append('| ' + ' | '.join(esc(r.get(c,'')) for c in cols) + ' |')
    path.write_text('\n'.join(lines)+'\n', encoding='utf-8')

def load_inventory():
    data=json.loads(INVENTORY_PATH.read_text(encoding='utf-8'))
    return data

def existing_source_hashes():
    d={}
    for base in [SOURCE, PROJECT/'IP应用规范']:
        if not base.exists(): continue
        for p in base.rglob('*'):
            if p.is_file():
                try: d[sha256(p)] = p
                except Exception: pass
    return d

def next_seq():
    nums=[]
    for p in SOURCE.iterdir():
        if p.is_file():
            m=re.match(r'^(\d{1,3})_', p.name)
            if m: nums.append(int(m.group(1)))
    return max(nums or [0])+1

def pdf_native_text(path: Path):
    try:
        doc=fitz.open(path)
        texts=[]
        for page in doc:
            texts.append(page.get_text('text') or '')
        return '\n'.join(texts), doc.page_count
    except Exception as e:
        return f'[PDF文本提取失败：{e}]', 0

def ocr_pdf(path: Path, out_txt: Path):
    out_txt.parent.mkdir(parents=True, exist_ok=True)
    stem=safe_name(path.stem)
    imgdir=PAGE_DIR/stem
    imgdir.mkdir(parents=True, exist_ok=True)
    doc=fitz.open(path)
    imgs=[]
    for i,page in enumerate(doc, start=1):
        img=imgdir/f'page_{i:03d}.png'
        if not img.exists():
            pix=page.get_pixmap(matrix=fitz.Matrix(2,2), alpha=False)
            pix.save(str(img))
        imgs.append(str(img))
    if not VISION.exists():
        raise RuntimeError('/tmp/vision_ocr 不存在')
    # OCR in chunks to avoid very long command lines / timeouts.
    parts=[]
    for start in range(0, len(imgs), 8):
        chunk=imgs[start:start+8]
        cp=subprocess.run([str(VISION), *chunk], text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, timeout=max(120, 60*len(chunk)))
        if cp.returncode != 0:
            parts.append(f'\n[Vision OCR failed chunk {start//8+1}: {cp.stderr[:500]}]\n')
        else:
            parts.append(cp.stdout)
    text='\n'.join(parts).strip()
    out_txt.write_text(text, encoding='utf-8')
    return text, doc.page_count

def extract_text_for(path: Path, archive_id: str):
    ext=path.suffix.lower()
    if ext == '.pdf':
        native, pages = pdf_native_text(path)
        if len(native.strip()) >= 800:
            out=TEXT_DIR/f'{sha256(path)[:12]}_{safe_name(strip_prefix(path.stem))}_pymupdf.txt'
            out.parent.mkdir(parents=True, exist_ok=True)
            out.write_text(native, encoding='utf-8')
            return out, len(native), pages, 'native'
        else:
            out=OCR_DIR/f'{safe_name(path.stem)}.txt'
            if out.exists() and out.stat().st_size > 20:
                text=out.read_text(encoding='utf-8', errors='ignore')
                return out, len(text), pages, 'ocr_cached'
            text, pages = ocr_pdf(path, out)
            return out, len(text), pages, 'ocr'
    elif ext in ['.jpg','.jpeg','.png','.webp']:
        # Existing image certificate OCRs are usually already present.
        out=OCR_DIR/f'{safe_name(path.stem)}.txt'
        if out.exists():
            text=out.read_text(encoding='utf-8', errors='ignore')
            return out, len(text), 1, 'ocr_cached'
        cp=subprocess.run([str(VISION), str(path)], text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, timeout=180)
        text=cp.stdout if cp.returncode==0 else f'[Vision OCR failed: {cp.stderr[:500]}]'
        out.write_text(text, encoding='utf-8')
        return out, len(text), 1, 'ocr'
    elif ext == '.docx':
        try:
            from docx import Document
            doc=Document(str(path))
            text='\n'.join(p.text for p in doc.paragraphs)
        except Exception as e:
            text=f'[DOCX提取失败：{e}]'
        out=TEXT_DIR/f'{sha256(path)[:12]}_{safe_name(strip_prefix(path.stem))}.txt'
        out.parent.mkdir(parents=True, exist_ok=True); out.write_text(text,encoding='utf-8')
        return out, len(text), '', 'docx'
    else:
        return Path(''), 0, '', 'binary'

def pick(regex, text, maxlen=80):
    m=re.search(regex, text, flags=re.S)
    if not m: return ''
    val=re.sub(r'\s+', '', m.group(1))
    val=re.split(r'[。；;\n]', val)[0]
    return val[:maxlen]

def classify(name: str, source_rel: str, text: str):
    low=(name+' '+source_rel).lower()
    party_a = pick(r'甲\s*方[：:：\s]*([^\n]{2,80})', text) or 'OCR未可靠提取；以原件为准'
    party_b = pick(r'乙\s*方[：:：\s]*([^\n]{2,80})', text) or 'OCR未可靠提取；以原件为准'
    other = ''
    ctype='合同/协议'
    project='万物有灵合同资料'
    scope='合同/协议原件留存及后续按关键词调取；具体授权、服务或采购范围以原件/OCR文本为准'
    money='未在台账摘录；如有金额/账号/开票等敏感信息，以原件内部复核为准'
    dispute='见原件争议解决条款；OCR未可靠汇总'
    term='见原件/OCR文本；签署页日期、手写日期、盖章日期需人工复核'
    risk='中；本行由文件名和OCR辅助归类，签章主体、日期、金额、附件及授权链需人工复核。'
    scene='需要调取该合同/协议原件或核对条款时调用'
    status='外发/正式使用前核对签章、期限、授权范围和是否为最终版本'

    if '法律顾问' in name or '法务服务' in name or '海蜂' in name and '法务' in name:
        ctype='法务服务合同'; project='常年法律顾问/法务服务'; scope='法律顾问/法务服务范围、服务期限、费用与争议解决等'; scene='查询法务顾问服务期限、服务额度、服务范围、付款和争议解决时调用'
    elif '房租' in name or '租赁' in name:
        ctype='房屋租赁合同'; project='办公/经营场所租赁'; scope='房屋租赁、租金、物业/费用、租期、解除和违约等'; scene='查询办公场地租赁期限、租金、押金、费用承担或续租/退租时调用'; risk='中；租期、租金、押金、交付状态、转租/装修/提前解除及签章日期需人工复核。'
    elif '屏销宝' in name or '电淘' in name or '电视淘宝' in text[:1000]:
        ctype='广告投放/技术服务合同'; project='电视淘宝屏销宝/电淘投放'; scope='电视淘宝/屏销宝技术服务、年度框架投放、曝光量/资源位等'; scene='查询屏销宝/电淘投放金额、服务周期、曝光量、付款、退款/补量和责任上限时调用'; risk='中高；通常存在预付费、未达曝光补量而非退款、服务方责任上限、单方调整规则等风险，金额/曝光量/最终版本需复核。'
        money=pick(r'(?:合同价|实际合同价|价款|费用|金额)[^\n]{0,20}([0-9,.，万万元人民币]+)', text) or money
    elif '授权' in name or 'ip' in low or '品牌授权' in name or '良渚' in name or '故宫' in name or '方程豹' in name or '敦煌' in name or '中国文字博物馆' in name:
        if '代理' in name:
            ctype='品牌/IP授权代理合同'
        else:
            ctype='IP授权/品牌合作合同'
        project='IP授权/品牌合作'
        scope='IP/品牌授权、代理、联名合作、商品开发或推广使用；具体范围、品类、地域、转授权边界以原件为准'
        scene='对外合作、招商洽谈、IP授权链核查、商用/外发前复核授权范围和期限时调用'
        risk='中高；需核对授权主体、权利来源、授权期限、地域/品类、转授权/第三方使用审批、素材使用边界及到期状态。'
        status='外发前必须核对是否仍在期限内、是否允许转授权/第三方使用、是否需书面审批或原件'
    elif '采购' in name or '订单' in name or '定制合作' in name or '文化产品合作' in name or '行李牌' in name or '冰箱贴' in name:
        ctype='产品合作/采购订单'; project='文创/商品采购与定制合作'; scope='产品设计、采购、定制、交付、验收、结算、质量/IP责任等'; scene='查询供应商、采购/定制产品、金额、交付验收、质量/IP责任和结算时调用'; risk='中高；需复核数量、单价、交期、验收标准、付款条件、质量/IP责任、主合同/订单对应关系。'
    elif '居间' in name or '解约金' in name:
        ctype='居间/服务协议'; project='云南白药解约金/居间事项'; scope='居间服务、促成事项、报酬/付款条件、保密和违约等'; scene='查询居间事项、解约金/报酬、付款触发条件和争议解决时调用'; risk='中高；需核对居间成功标准、付款触发条件、禁止跳单、税费、证据留存和争议解决。'
    elif '连带责任' in name or '承诺函' in name:
        ctype='承诺函/担保责任文件'; project='国博项目/秘色破茧连带责任'; scope='承诺/担保/连带责任范围，以原件为准'; scene='核查承诺主体、责任范围、连带责任和项目付款/履约风险时调用'; risk='高；涉及连带责任或担保性质，需律师复核责任范围、期限、触发条件和公司授权。'
    elif '保密' in name or 'nda' in low:
        ctype='保密协议/NDA'; project='产品开发/资料披露保密'; scope='保密信息范围、保密期限、违约责任和资料返还/销毁'; scene='对外合作前保密约束、产品开发资料披露前、追责泄密时调用'; risk='中；需复核保密范围、例外、期限、违约金、资料返还/销毁和管辖。'

    # filename-known parties/project improvements without exposing sensitive fields
    if '万物有灵' in name and party_a.startswith('OCR未'): party_a='杭州万物有灵文化科技有限公司或相关合作方（以原件为准）'
    if '鲜活' in name and party_a.startswith('OCR未'): party_a='杭州鲜活万物相关主体或合作方（以原件为准）'
    if '元浪' in name and party_b.startswith('OCR未'): other='杭州元浪文化科技有限公司（文件名出现，需以原件确认角色）'
    if '秘色' in name and party_b.startswith('OCR未'): other='杭州秘色破茧文化创意有限公司（文件名出现，需以原件确认角色）'
    if '华数' in name or '屏销宝' in name: party_a='杭州华数智屏信息技术有限公司/电视淘宝相关主体（以原件为准）'
    if '极是' in source_rel or '极是' in name: other='极是（广州）生物技术有限公司/相关品牌方（以原件确认）'
    if '威莱' in name: project += '；威莱'
    if 'LA' in name or 'la' in low: project += '；LA'
    if '纺优美' in name: project += '；纺优美'
    if '菁华' in name: project += '；菁华'
    if '王小卤' in name: project += '；王小卤'
    if '方程豹' in name: project='比亚迪方程豹IP授权'
    if '敦煌' in name: project='敦煌美术研究所IP/相关合作'
    if '中国文字博物馆' in name: project='中国文字博物馆IP/相关合作'
    if '国博' in name or '国家博物馆' in name: project='中国国家博物馆/国博相关项目'
    if '良渚' in name: project='良渚文化/良渚博物院IP'
    if '故宫' in name or '养心殿' in name: project='故宫/养心殿相关IP授权代理'
    if '西安博物院' in name: project='西安博物院项目'
    if '自贸区' in name: project='中国工艺美术馆/自贸区相关采购'
    if '云南白药' in name: project='云南白药解约金/居间事项'

    # date snippets for convenience only
    dates=re.findall(r'20\d{2}\s*[年./-]\s*\d{1,2}\s*[月./-]\s*\d{0,2}\s*日?', text[:8000])
    if dates:
        term='；'.join(dates[:4]) + '（OCR日期片段，需结合原件判断签署/期限）'
    return ctype, project, party_a, party_b, other, term, scope, money, dispute, risk, scene, status

def build_row(archive_id, path: Path, original_rel: str, source_package='', duplicate_status=''):
    text_path, chars, pages, mode = extract_text_for(path, archive_id)
    text = ''
    if text_path and str(text_path) and Path(text_path).exists() and Path(text_path).is_file():
        text = Path(text_path).read_text(encoding='utf-8', errors='ignore')
    name = strip_prefix(path.stem)
    ctype, project, pa, pb, other, term, scope, money, dispute, risk, scene, status = classify(name, original_rel, text)
    return {
        'ledger': {
            '归档编号': archive_id,
            '签约/生效时间': term,
            '合同名称': name,
            '合同类型': ctype,
            '项目/IP': project,
            '甲方': pa,
            '乙方': pb,
            '其他方/合作方': other,
            '合约有效期/授权期限': term,
            '授权/合作范围': scope,
            '金额/费用/分成': money,
            '争议解决': dispute,
            '风险/待核对': risk,
            '原始文件': str(path),
            'OCR文本': str(text_path) if text_path else '',
            'SHA256': sha256(path),
            '页数/OCR字符数': f'{path.suffix.upper().lstrip(".")} {pages}页；{mode}；{chars}字' if pages else f'{path.suffix.upper().lstrip(".")}；{mode}；{chars}字',
        },
        'source': {
            '归档编号': archive_id,
            '展示文件名': name + path.suffix,
            '原始来源路径': original_rel,
            '存档路径': str(path),
            '文本/OCR路径': str(text_path) if text_path else '',
            'SHA256': sha256(path),
            '文件大小': str(path.stat().st_size),
            '页数': str(pages),
            '文本字符数': str(chars),
            '添加时间': NOW,
            '文件类型': path.suffix.lower().lstrip('.'),
            '来源压缩包': source_package,
            '重复状态': duplicate_status,
        },
        'catalog_extra': {'scene': scene, 'status': status}
    }

def category_for(row):
    return row.get('合同类型','合同/协议')

def scene_for(row):
    c=row.get('合同类型','')
    if '授权' in c: return '对外合作、招商洽谈、IP授权链核查、商用/外发前复核授权范围和期限时调用'
    if '屏销宝' in row.get('项目/IP','') or '广告投放' in c: return '查询电淘/屏销宝投放合同、金额、曝光量、付款、退款/补量、责任上限时调用'
    if '采购' in c or '产品合作' in c: return '查询供应商、采购金额、交付/验收、产品质量/IP责任和结算时调用'
    if '法务' in c: return '查询法务服务期限、服务额度、顾问服务范围、仲裁/诉讼条款时调用'
    if '房屋' in c: return '查询房租、租期、押金、费用承担、续租/退租时调用'
    if '保密' in c: return '对外合作前保密约束、资料披露前、追责泄密时调用'
    return '需要调取该合同/协议原件或核对条款时调用'

def main():
    DELIV.mkdir(exist_ok=True); DOCS.mkdir(exist_ok=True); SOURCE.mkdir(exist_ok=True); OCR_DIR.mkdir(parents=True, exist_ok=True); TEXT_DIR.mkdir(parents=True, exist_ok=True)
    inv=load_inventory()
    zip_sha=sha256(ZIP_PATH)
    existing_hash=existing_source_hashes()

    # Load existing user-facing rows (new-style rows currently in workbook/csv).
    current_rows=[]
    current_src=[]
    current_by_sha={}
    for r in read_csv(BASE_CSV):
        rr={c:r.get(c,'') for c in LEDGER_COLS}
        if not rr.get('归档编号') and r.get('序号'):
            rr['归档编号']=r.get('归档编号','') or f'LEGACY-{r.get("序号")}'
        if rr.get('SHA256'):
            current_by_sha[rr['SHA256']]=rr
        if rr.get('归档编号'):
            current_rows.append(rr)

    # Add all existing source/IP files that are not in current rows (repair older source rows 01-28).
    built=[]
    source_files=[]
    for base in [SOURCE, PROJECT/'IP应用规范']:
        if not base.exists(): continue
        for p in sorted(base.rglob('*')):
            if not p.is_file(): continue
            s=sha256(p)
            if s in current_by_sha:
                continue
            # assign archive id from numeric prefix when available
            m=re.match(r'^(\d{1,3})_', p.name)
            if m:
                aid=f'LC-20260607-{int(m.group(1)):03d}'
            else:
                aid=f'LEGACY-{s[:8]}'
            built.append(build_row(aid, p, rel_or_abs(p), '', 'existing_source_repaired'))
            current_by_sha[s]=built[-1]['ledger']

    # Copy and process new unique files from this upload. Duplicates are not copied again.
    nextn=next_seq()
    uploaded_duplicates=[]
    new_processed=[]
    for f in inv['files']:
        src=Path(inv['extract_dir'])/f['rel']
        if not src.is_file(): continue
        s=f['sha256']
        if s in existing_hash:
            uploaded_duplicates.append({**f, 'existing_path': str(existing_hash[s])})
            continue
        # If a previous run copied it, use that file instead of re-copying.
        if s in current_by_sha:
            uploaded_duplicates.append({**f, 'existing_path': current_by_sha[s].get('原始文件','')})
            continue
        clean=safe_name(Path(f['rel']).name)
        dest=SOURCE/f'{nextn:02d}_{clean}'
        while dest.exists():
            nextn += 1
            dest=SOURCE/f'{nextn:02d}_{clean}'
        shutil.copy2(src, dest)
        aid=f'LC-20260615-{nextn:03d}'
        b=build_row(aid, dest, f['rel'], str(ZIP_PATH), 'new_from_20260615_zip')
        built.append(b); new_processed.append(b)
        current_by_sha[s]=b['ledger']; existing_hash[s]=dest
        nextn += 1

    # Combine rows: current rows + built source repairs/new; de-dupe by SHA keeping first current row, then built.
    all_rows=[]; seen=set()
    for r in current_rows + [b['ledger'] for b in built]:
        key=r.get('SHA256') or r.get('归档编号')
        if key in seen: continue
        seen.add(key); all_rows.append({c:r.get(c,'') for c in LEDGER_COLS})

    # Source manifest rows from all ledger files.
    src_rows=[]
    for r in all_rows:
        p=Path(r.get('原始文件',''))
        src_rows.append({
            '归档编号': r.get('归档编号',''),
            '展示文件名': p.name if p.name else r.get('合同名称',''),
            '原始来源路径': '',
            '存档路径': r.get('原始文件',''),
            '文本/OCR路径': r.get('OCR文本',''),
            'SHA256': r.get('SHA256',''),
            '文件大小': str(p.stat().st_size) if p.exists() else '',
            '页数': r.get('页数/OCR字符数',''),
            '文本字符数': '',
            '添加时间': NOW,
            '文件类型': p.suffix.lower().lstrip('.') if p.name else '',
            '来源压缩包': str(ZIP_PATH) if any(r.get('SHA256')==b['ledger'].get('SHA256') for b in new_processed) else '',
            '重复状态': '',
        })

    catalog=[]
    for r in all_rows:
        filecat=category_for(r)
        keywords='、'.join([x for x in [r.get('合同名称',''), r.get('项目/IP',''), r.get('甲方',''), r.get('乙方',''), filecat, Path(r.get('原始文件','')).name] if x])[:500]
        status='外发/正式使用前核对签章、期限、授权范围、金额及是否最终版本；OCR字段需人工复核'
        if '授权' in filecat:
            status='授权类文件外发前需核对是否仍在期限内、是否允许转授权/第三方使用、是否需原件或书面审批'
        catalog.append({
            '归档编号': r.get('归档编号',''), '文件类别': filecat, '可检索关键词': keywords,
            '建议调用场景': scene_for(r), '展示文件名': Path(r.get('原始文件','')).name, '存档文件': r.get('原始文件',''),
            '合同名称': r.get('合同名称',''), '甲方': r.get('甲方',''), '乙方': r.get('乙方',''),
            '有效期/授权期': r.get('合约有效期/授权期限',''), 'SHA256': r.get('SHA256',''), '状态/外发提醒': status,
        })

    # Sort stable by archive id date then LC numeric.
    def sortkey(r):
        aid=r.get('归档编号','')
        nums=re.findall(r'(\d{8})-(\d+)$', aid)
        if nums:
            return (nums[0][0], int(nums[0][1]), aid)
        return ('99999999', 999999, aid)
    all_rows.sort(key=sortkey); src_rows.sort(key=sortkey); catalog.sort(key=sortkey)

    write_csv(BASE_CSV, all_rows, LEDGER_COLS)
    write_csv(CATALOG_CSV, catalog, CAT_COLS)
    md_table(MD_PATH, '合同总表（截至2026-06-15）', all_rows, LEDGER_COLS, '本表根据PDF文本层/OCR和文件名辅助分类整理；签署日期、印章、手写内容、金额、附件、授权链均需以原件人工复核。银行账号、身份证号、电话、税号等敏感信息不在本表展开。')
    md_table(CATALOG_MD, '可发送文件目录（截至2026-06-15）', catalog, CAT_COLS, '用于后续按关键词/类别快速找到原件并发送；授权类和合同原件外发前需核对是否过期、是否最终版本、是否需书面审批。')

    dup_rows=[]
    for i,d in enumerate(uploaded_duplicates,1):
        dup_rows.append({'序号':str(i),'压缩包内路径':d['rel'],'SHA256':d['sha256'],'已存在路径':d.get('existing_path') or d.get('duplicate_existing',''),'处理':'按SHA256判定重复，未重复入账/未再次复制为新存档文件'})
    if dup_rows:
        md_table(DUP_MD, '重复文件清单（2026-06-15 万物有灵合同资料包）', dup_rows, ['序号','压缩包内路径','SHA256','已存在路径','处理'])

    # Manifest JSON, preserving object shape.
    manifest={'updated_at': NOW, 'project': str(PROJECT), 'files': []}
    for r in src_rows:
        manifest['files'].append({
            'archive_id': r['归档编号'], 'source_filename': r['展示文件名'], 'stored_path': r['存档路径'], 'sha256': r['SHA256'],
            'size_bytes': r['文件大小'], 'extracted_text_path': r['文本/OCR路径'], 'note': '合同台账统一重建；敏感信息仅原件和文本留存。'
        })
    MANIFEST_PATH.write_text(json.dumps(manifest, ensure_ascii=False, indent=2), encoding='utf-8')
    ocr_summary=[]
    for r in src_rows:
        p=r['文本/OCR路径']; chars=''
        if p and Path(p).exists():
            try: chars=str(len(Path(p).read_text(encoding='utf-8', errors='ignore')))
            except Exception: pass
        ocr_summary.append({'archive_id':r['归档编号'],'file':r['展示文件名'],'ocr_text':p,'chars':chars,'sha256':r['SHA256']})
    OCR_SUMMARY_PATH.write_text(json.dumps(ocr_summary, ensure_ascii=False, indent=2), encoding='utf-8')

    # Excel workbook.
    wb=Workbook(); wb.remove(wb.active)
    def add_sheet(title, rows, cols):
        ws=wb.create_sheet(title)
        ws.append(cols)
        for r in rows: ws.append([r.get(c,'') for c in cols])
        return ws
    add_sheet('合同总表', all_rows, LEDGER_COLS)
    add_sheet('源文件清单', src_rows, SRC_COLS)
    add_sheet('可发送文件目录', catalog, CAT_COLS)
    add_sheet('本批重复文件', dup_rows, ['序号','压缩包内路径','SHA256','已存在路径','处理'])
    process_rows=[
        {'时间':NOW,'事项':'处理上传压缩包','说明':f'万物有灵合同资料.zip；SHA256={zip_sha}；压缩包内文件{len(inv["files"])}个。'},
        {'时间':NOW,'事项':'去重结果','说明':f'压缩包内{len(uploaded_duplicates)}个文件与既有source文件SHA256重复，未重复入账；新增复制并入账{len(new_processed)}个文件；同时修复/补录既有source存档文件{len(built)-len(new_processed)}个。'},
        {'时间':NOW,'事项':'归类规则','说明':'按合同/授权/IP代理/产品合作采购/广告投放技术服务/法务服务/房租/居间/承诺函等分类；敏感字段不在总表展开。'},
        {'时间':NOW,'事项':'可靠性提示','说明':'扫描PDF主要依赖OCR，日期、印章、金额、附件、授权链、是否最终签署版均需人工复核；本台账为法务辅助，不替代律师意见。'},
    ]
    add_sheet('处理说明', process_rows, ['时间','事项','说明'])
    field_rows=[{'字段':c,'说明':'见表头；敏感信息不在衍生表展开，原件/OCR留存备查。'} for c in LEDGER_COLS]
    add_sheet('字段说明', field_rows, ['字段','说明'])
    for ws in wb.worksheets:
        ws.freeze_panes='A2'
        for cell in ws[1]:
            cell.font=Font(bold=True, color='FFFFFF')
            cell.fill=PatternFill('solid', fgColor='4F81BD')
            cell.alignment=Alignment(horizontal='center', vertical='center', wrap_text=True)
        for row in ws.iter_rows(min_row=2):
            for cell in row:
                cell.alignment=Alignment(vertical='top', wrap_text=True)
        for col in range(1, ws.max_column+1):
            letter=get_column_letter(col)
            max_len=0
            for cell in ws[letter]:
                val='' if cell.value is None else str(cell.value)
                max_len=max(max_len, min(len(val), 60))
            ws.column_dimensions[letter].width=max(10, min(max_len+2, 45))
        ws.auto_filter.ref=ws.dimensions
    wb.save(BASE_XLSX)

    # Batch note.
    cat_counts=Counter(r['合同类型'] for r in all_rows if any(r.get('SHA256')==b['ledger'].get('SHA256') for b in new_processed))
    repaired_count=len(built)-len(new_processed)
    note=[]
    note.append('# 新增法务文档记录（2026-06-15｜万物有灵合同资料包）\n')
    note.append(f'处理时间：{NOW}\n')
    note.append(f'- 来源压缩包：`{ZIP_PATH}`')
    note.append(f'- 压缩包 SHA256：`{zip_sha}`')
    note.append(f'- 压缩包内文件：{len(inv["files"])} 个')
    note.append(f'- 新增入账：{len(new_processed)} 个')
    note.append(f'- SHA256重复未重复入账：{len(uploaded_duplicates)} 个')
    note.append(f'- 同步修复既有 source 存档记录：{repaired_count} 个')
    note.append('\n## 本批新增文件分类\n')
    for k,v in cat_counts.most_common(): note.append(f'- {k}：{v} 个')
    note.append('\n## 重要提醒\n')
    note.append('- 本批多数为扫描 PDF，OCR 只用于检索和初步归类；签章页、签署日期、金额、附件空格、授权链、是否最终版本需人工复核。')
    note.append('- 授权/IP相关文件外发前需核对是否仍在期限内、是否允许转授权/第三方使用、是否需书面审批或原件。')
    note.append('- 屏销宝/电淘类合同需重点复核付款、退款/补量、曝光量、责任上限和最终有效版本。')
    note.append('- 台账/目录未展开银行账号、个人电话、税号、身份证号等敏感字段；原件和OCR文本仅用于内部复核。')
    note.append('\n## 本批新增清单\n')
    note.append('| 归档编号 | 合同名称 | 分类 | 项目/IP | 风险提示 |')
    note.append('|---|---|---|---|---|')
    for b in new_processed:
        r=b['ledger']
        note.append('| ' + ' | '.join(str(r.get(c,'' )).replace('|','｜').replace('\n',' ')[:180] for c in ['归档编号','合同名称','合同类型','项目/IP','风险/待核对']) + ' |')
    NOTE_PATH.write_text('\n'.join(note)+'\n', encoding='utf-8')

    # README summary.
    rt=README.read_text(encoding='utf-8') if README.exists() else '# 法务合同台账与合同存档\n'
    line=f'- 当前台账：截至2026-06-15共{len(all_rows)}条记录；本次万物有灵合同资料包新增{len(new_processed)}条，重复{len(uploaded_duplicates)}条未重复入账。'
    if '- 当前台账：' in rt:
        rt='\n'.join([line if l.startswith('- 当前台账：') else l for l in rt.splitlines()])+'\n'
    else:
        rt=rt.rstrip()+'\n'+line+'\n'
    README.write_text(rt, encoding='utf-8')

    # Verify workbook.
    wb2=load_workbook(BASE_XLSX, read_only=True, data_only=True)
    verify={ws.title:(ws.max_row, ws.max_column) for ws in wb2.worksheets}
    expected=len(all_rows)+1
    actual=wb2['合同总表'].max_row
    if actual != expected:
        raise RuntimeError(f'Workbook row mismatch: expected {expected}, got {actual}')
    missing=[r['原始文件'] for r in all_rows if r.get('原始文件') and not Path(r['原始文件']).exists()]
    if missing:
        raise RuntimeError(f'Missing stored files: {missing[:5]}')
    print(json.dumps({
        'ok': True, 'zip_sha256': zip_sha, 'zip_files': len(inv['files']), 'new_added': len(new_processed),
        'duplicates': len(uploaded_duplicates), 'repaired_existing_source_rows': repaired_count,
        'ledger_rows': len(all_rows), 'workbook_sheets': verify,
        'xlsx': str(BASE_XLSX), 'catalog': str(CATALOG_CSV), 'note': str(NOTE_PATH), 'duplicates_note': str(DUP_MD) if dup_rows else '',
        'category_counts_new': dict(cat_counts)
    }, ensure_ascii=False, indent=2))

if __name__ == '__main__':
    main()
