#!/usr/bin/env python3
from pathlib import Path
from zipfile import ZipFile
import xml.etree.ElementTree as ET
import shutil, re, json, hashlib, os
from PIL import Image, ImageOps, ImageDraw, ImageFont

# XML namespace prefix -> URI map. It is passed as the ``namespaces`` argument
# to ElementTree ``find``/``findall`` so that prefixed paths used in this file
# (e.g. './/a:t', './/a:blip', 'rel:Relationship') resolve, and its 'r' entry
# is used to build the qualified ``r:embed`` / ``r:link`` attribute names.
NS = {
    'a':'http://schemas.openxmlformats.org/drawingml/2006/main',  # DrawingML
    'p':'http://schemas.openxmlformats.org/presentationml/2006/main',  # PresentationML
    'r':'http://schemas.openxmlformats.org/officeDocument/2006/relationships',  # r:embed / r:link attributes
    'rel':'http://schemas.openxmlformats.org/package/2006/relationships',  # elements inside *.rels parts
}

def natural_key(p):
    """Return a sort key that compares embedded digit runs numerically.

    The string form of *p* is split into alternating text and digit-run
    pieces; digit runs become ``int`` so that, for example, ``slide2``
    sorts before ``slide10``.

    Args:
        p: Any object; ``str(p)`` is what gets split.

    Returns:
        A list alternating ``str`` and ``int`` items, always starting and
        ending with a (possibly empty) ``str``.
    """
    pieces = re.split(r'(\d+)', str(p))
    # ``str.isdecimal`` matches exactly what ``\d`` matches and what ``int``
    # accepts. ``str.isdigit`` is also true for characters such as '²', which
    # ``\d`` leaves inside a text piece and ``int`` rejects with ValueError.
    return [int(piece) if piece.isdecimal() else piece for piece in pieces]

def safe_name(s):
    """Turn *s* into a short name usable as a directory name.

    Every run of whitespace or of the characters ``\\ / : * ? " < > |`` is
    replaced by a single underscore, leading/trailing underscores are
    removed, and the result is cut to 80 characters.

    Returns:
        The sanitized name, or ``'ppt'`` when nothing is left.
    """
    collapsed = re.sub(r'[\\/:*?"<>|\s]+', '_', s)
    trimmed = collapsed.strip('_')[:80]
    if trimmed:
        return trimmed
    return 'ppt'

def parse_rels(z, rel_path):
    """Read a ``.rels`` part and index its relationships by Id.

    Args:
        z: An open archive object providing ``namelist()`` and ``read()``.
        rel_path: Archive member name of the relationships part.

    Returns:
        ``{Id: (Target, Type)}`` for every ``Relationship`` child of the
        part's root element. ``Type`` defaults to ``''`` when absent;
        ``Id`` and ``Target`` are ``None`` when absent. An empty dict is
        returned when *rel_path* is not a member of the archive.
    """
    if rel_path not in z.namelist():
        return {}
    tree_root = ET.fromstring(z.read(rel_path))
    return {
        node.attrib.get('Id'): (node.attrib.get('Target'), node.attrib.get('Type', ''))
        for node in tree_root.findall('rel:Relationship', NS)
    }

def target_to_zip_path(slide_path, target):
    """Resolve a relationship ``Target`` to an archive member name.

    Args:
        slide_path: Archive member name of the part that owns the
            relationship (e.g. ``'ppt/slides/slide1.xml'``).
        target: The relationship's ``Target`` value. A leading ``/`` means
            it is relative to the archive root; otherwise it is relative
            to the directory of *slide_path*.

    Returns:
        The normalized, ``/``-separated member name with ``.`` and ``..``
        segments collapsed. No check is made that the member exists.
    """
    # Function-scope import keeps this block self-contained. ``posixpath`` is
    # used rather than ``os.path``/``Path`` because archive member names
    # always use '/' regardless of the host platform.
    import posixpath

    if target.startswith('/'):
        joined = target.lstrip('/')
    else:
        joined = posixpath.join(posixpath.dirname(slide_path), target)
    # normpath collapses every '..' segment, not only the single
    # 'ppt/slides/../' shape, so targets such as '../../docProps/x' or parts
    # living outside ppt/slides resolve correctly too.
    return posixpath.normpath(joined)

def iter_slides(z):
    """List the slide parts of an open archive in natural filename order.

    Args:
        z: An open archive object providing ``namelist()``.

    Returns:
        Member names matching ``ppt/slides/slide<digits>.xml``, sorted with
        ``natural_key`` so that ``slide2.xml`` precedes ``slide10.xml``.
    """
    # NOTE(review): ordering follows the number in the part name; whether that
    # equals the on-screen slide order depends on the tool that wrote the
    # file — confirm if presentation order matters.
    slide_pattern = re.compile(r'ppt/slides/slide\d+\.xml$')
    slide_parts = [member for member in z.namelist() if slide_pattern.match(member)]
    slide_parts.sort(key=natural_key)
    return slide_parts

def extract_text(root):
    """Collect the non-blank text of every ``a:t`` element under *root*.

    Args:
        root: A parsed ElementTree element.

    Returns:
        A list of stripped strings in document order; elements whose text
        is missing or whitespace-only are skipped.
    """
    stripped_runs = ((node.text or '').strip() for node in root.findall('.//a:t', NS))
    return [run for run in stripped_runs if run]

def image_size(path):
    """Return the ``size`` of the image at *path*, or ``(0, 0)`` on any failure.

    The file is opened with ``Image.open`` inside a ``with`` block, so it is
    closed again before returning. Any exception raised while opening or
    reading the size is swallowed and reported as ``(0, 0)``.
    """
    try:
        with Image.open(path) as opened:
            dims = opened.size
    except Exception:
        dims = (0, 0)
    return dims

def make_thumb(img_path, size=(240,170)):
    """Build a fixed-size thumbnail of an image, centred on a white canvas.

    Args:
        img_path: Path of the image file to open.
        size: ``(width, height)`` of the returned canvas.

    Returns:
        An RGB image of exactly *size*. The source is shrunk in place with
        ``Image.thumbnail`` to fit within *size* and pasted centred on white.
        If anything fails (unreadable or unsupported file, etc.) a plain
        ``#eeeeee`` image of the same size is returned instead.
    """
    try:
        # Open inside ``with`` so the underlying file is closed as soon as the
        # pixel data has been copied by ``convert``; the converted image is an
        # independent object and stays usable afterwards.
        with Image.open(img_path) as source:
            im = source.convert('RGB')
        im.thumbnail(size, Image.LANCZOS)
        canvas = Image.new('RGB', size, 'white')
        x = (size[0] - im.width) // 2
        y = (size[1] - im.height) // 2
        canvas.paste(im, (x, y))
        return canvas
    except Exception:
        # Deliberate best effort: a contact sheet with a grey placeholder is
        # more useful than aborting on one bad image.
        return Image.new('RGB', size, '#eeeeee')

def draw_wrapped(draw, text, xy, font, fill, max_width, line_spacing=3, max_lines=4):
    """Draw *text* wrapped character by character to fit *max_width*.

    Wrapping is per character (not per word): a character is moved to the
    next line when adding it would make ``draw.textbbox`` report a right edge
    beyond *max_width*.

    Args:
        draw: Drawing object providing ``textbbox`` and ``text``.
        text: The string to render.
        xy: ``(x, y)`` of the top-left corner of the first line.
        font: Font passed through to *draw*.
        fill: Text colour passed through to ``draw.text``.
        max_width: Maximum line width, in the units ``textbbox`` returns.
        line_spacing: Extra vertical gap added after each line.
        max_lines: Maximum number of lines drawn. When the text needs more,
            the last kept line has its final character replaced by ``'…'``.
            Values below 1 draw nothing.

    Returns:
        The y coordinate just below the last drawn line (the starting y when
        nothing was drawn).
    """
    x, y = xy
    if max_lines < 1:
        # Previously max_lines=0 with non-empty text raised IndexError.
        return y

    lines = []
    current = ''
    for ch in text:
        candidate = current + ch
        if draw.textbbox((0, 0), candidate, font=font)[2] <= max_width:
            current = candidate
            continue
        if current:
            lines.append(current)
        current = ch
    if current:
        lines.append(current)
    if not lines:
        return y

    if len(lines) > max_lines:
        lines = lines[:max_lines]
        lines[-1] = lines[-1][:-1] + '…'

    # Not every font object exposes ``size`` (main() may fall back to
    # ``ImageFont.load_default()``), so measure a reference string instead of
    # failing with AttributeError.
    line_height = getattr(font, 'size', None)
    if line_height is None:
        box = draw.textbbox((0, 0), 'Ag', font=font)
        line_height = box[3] - box[1]
    step = line_height + line_spacing

    for ln in lines:
        draw.text((x, y), ln, font=font, fill=fill)
        y += step
    return y

def _load_fonts():
    """Return ``(body_font, title_font)`` used for contact-sheet text."""
    font_path = '/System/Library/Fonts/PingFang.ttc'
    try:
        return ImageFont.truetype(font_path, 14), ImageFont.truetype(font_path, 18)
    except Exception:
        # Deliberate best effort: if the system font cannot be loaded for any
        # reason, fall back to Pillow's built-in default for both sizes.
        return ImageFont.load_default(), ImageFont.load_default()


def _unique_name(base, taken):
    """Return *base*, or *base* plus a numeric suffix if already in *taken*."""
    candidate = base
    suffix = 2
    # Compare case-folded so names that differ only in case do not end up in
    # the same directory on a case-insensitive filesystem.
    while candidate.casefold() in taken:
        candidate = f'{base}_{suffix}'
        suffix += 1
    taken.add(candidate.casefold())
    return candidate


def _extract_deck(ppt, media_out, project):
    """Extract per-slide text and embedded images of one deck into a dict."""
    embed_attr = '{%s}embed' % NS['r']
    link_attr = '{%s}link' % NS['r']
    deck = {'file': str(ppt), 'slides': []}
    with ZipFile(ppt) as z:
        # Build the member set once; membership is tested for every image.
        members = set(z.namelist())
        for idx, sp in enumerate(iter_slides(z), start=1):
            root = ET.fromstring(z.read(sp))
            texts = extract_text(root)
            rels = parse_rels(z, 'ppt/slides/_rels/' + Path(sp).name + '.rels')
            referenced = (
                blip.attrib.get(embed_attr) or blip.attrib.get(link_attr)
                for blip in root.findall('.//a:blip', NS)
            )
            # dict.fromkeys de-duplicates while keeping first-seen order.
            rids = list(dict.fromkeys(rid for rid in referenced if rid))
            imgs = []
            # ``n`` counts every referenced id, including ones skipped below,
            # so the numbers embedded in file names stay stable.
            for n, rid in enumerate(rids, start=1):
                target, typ = rels.get(rid, (None, ''))
                if not target or 'image' not in typ:
                    continue
                zp = target_to_zip_path(sp, target)
                if zp not in members:
                    continue
                ext = Path(zp).suffix or '.bin'
                data = z.read(zp)
                digest = hashlib.sha1(data).hexdigest()[:10]
                fp = media_out / f'slide{idx:03d}_{n:02d}_{digest}{ext}'
                if not fp.exists():
                    fp.write_bytes(data)
                w, hgt = image_size(fp)
                imgs.append({'file': str(fp.relative_to(project)), 'w': w, 'h': hgt, 'zip_path': zp})
            deck['slides'].append({'slide': idx, 'texts': texts, 'images': imgs})
    return deck


def _write_deck_reports(deck, out):
    """Write ``slide_text.md`` and ``manifest.json`` for one deck into *out*."""
    md = []
    for s in deck['slides']:
        md.append(f"## Slide {s['slide']}")
        if s['texts']:
            md.extend(f"- {t}" for t in s['texts'])
        else:
            md.append('- [no text]')
        md.append(f"images: {len(s['images'])}\n")
    (out / 'slide_text.md').write_text('\n'.join(md), encoding='utf-8')
    (out / 'manifest.json').write_text(json.dumps(deck, ensure_ascii=False, indent=2), encoding='utf-8')


def _render_tile(name, slide, project, font, font2):
    """Render one 560x260 tile: title, text excerpt and up to 4 thumbnails."""
    # Largest images first (by pixel area); unreadable ones report 0x0.
    largest = sorted(slide['images'], key=lambda img: img['w'] * img['h'], reverse=True)[:4]
    tile = Image.new('RGB', (560, 260), '#f7f3ed')
    d = ImageDraw.Draw(tile)
    d.text((10, 8), f"{name} | P{slide['slide']}", font=font2, fill='#4b2e20')
    excerpt = ' / '.join(slide['texts'][:8])[:120]
    draw_wrapped(d, excerpt, (10, 36), font, '#222222', 250, max_lines=6)
    x0, y0 = 280, 34
    for j, img in enumerate(largest):
        thumb = make_thumb(project / img['file'], (130, 95))
        # 2x2 grid of thumbnails to the right of the text column.
        x = x0 + (j % 2) * 138
        y = y0 + (j // 2) * 104
        tile.paste(thumb, (x, y))
        d.rectangle([x, y, x + 129, y + 94], outline='#d0c2b0')
    return tile


def _save_contact_sheet(tiles, sheet_path):
    """Lay *tiles* out two per row on a white sheet and save it as JPEG."""
    cols = 2
    rows = (len(tiles) + cols - 1) // cols
    sheet = Image.new('RGB', (cols * 560, rows * 260), 'white')
    for i, tile in enumerate(tiles):
        row, col = divmod(i, cols)
        sheet.paste(tile, (col * 560, row * 260))
    sheet.save(sheet_path, quality=88)


def main(project=None):
    """Extract text, images and a contact sheet from every deck in a project.

    Every ``*.pptx`` found recursively under ``<project>/source`` is processed
    in sorted path order. For each deck a directory
    ``<project>/work/incoming_extract_20260605/<name>/`` is created holding:

    * ``media/`` - every image referenced by a slide, named
      ``slideNNN_MM_<sha1 prefix><ext>``;
    * ``slides/`` - created but not populated here;
    * ``slide_text.md`` and ``manifest.json`` - per-slide text and image list;
    * ``contact_sheet.jpg`` - one tile per slide (only if the deck has slides).

    A ``summary.json`` covering all decks is written next to the deck
    directories and also printed to stdout as JSON.

    Args:
        project: Project root directory. Defaults to the original hard-coded
            location when omitted, so ``main()`` behaves as before.

    Notes:
        Files whose name starts with ``~$`` or ``._`` are skipped: these
        prefixes are used for Office lock files and macOS AppleDouble
        sidecars, which match ``*.pptx`` but are not zip archives. Decks whose
        sanitized names collide get ``_2``, ``_3``, ... appended so they do
        not overwrite each other's output.
    """
    if project is None:
        project = Path('/Users/bot1/Volumes/root_for_ai/AI工作区/文博IP_PPT_公司文创开发能力展示_20260604_2014')
    else:
        project = Path(project)

    sources = sorted(
        (p for p in (project / 'source').rglob('*.pptx') if not p.name.startswith(('~$', '._'))),
        key=str,
    )
    outroot = project / 'work' / 'incoming_extract_20260605'
    outroot.mkdir(parents=True, exist_ok=True)
    font, font2 = _load_fonts()

    summary = []
    taken_names = set()
    for ppt in sources:
        name = _unique_name(safe_name(ppt.stem), taken_names)
        out = outroot / name
        media_out = out / 'media'
        slide_out = out / 'slides'
        media_out.mkdir(parents=True, exist_ok=True)
        slide_out.mkdir(parents=True, exist_ok=True)

        deck = _extract_deck(ppt, media_out, project)
        _write_deck_reports(deck, out)

        tiles = [_render_tile(name, s, project, font, font2) for s in deck['slides']]
        if tiles:
            _save_contact_sheet(tiles, out / 'contact_sheet.jpg')

        summary.append({
            'name': name,
            'ppt': str(ppt),
            'slides': len(deck['slides']),
            'images': sum(len(s['images']) for s in deck['slides']),
            'out': str(out.relative_to(project)),
        })

    (outroot / 'summary.json').write_text(json.dumps(summary, ensure_ascii=False, indent=2), encoding='utf-8')
    print(json.dumps(summary, ensure_ascii=False, indent=2))

# Script entry point: run the extraction only when this file is executed
# directly, not when it is imported as a module.
if __name__=='__main__': main()
