#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import os, re, json, time, hashlib, urllib.parse, urllib.request, shutil, ssl
from pathlib import Path
from PIL import Image, ImageOps, ImageDraw, ImageFont, UnidentifiedImageError

# Output root for this collection run; every path below hangs off it.
ROOT = Path('/Users/bot1/Volumes/root_for_ai/AI工作区/国博_图片素材_馆藏八大件_20260612_1857')
IMG_ROOT = ROOT/'images'    # one sub-directory per item id (see main)
CACHE = ROOT/'cache'        # created below; not referenced elsewhere in the visible code
DOCS = ROOT/'docs'          # run-level manifests written by main
SOURCES = ROOT/'sources'    # candidate lists and raw official-page dumps
# Create the whole directory layout up front; note that this runs at import time.
for p in [IMG_ROOT, CACHE, DOCS, SOURCES, ROOT/'contact_sheets', ROOT/'deliverables']:
    p.mkdir(parents=True, exist_ok=True)

# Artifacts to collect. Each entry has:
#   id      - directory name under IMG_ROOT and the key used in OFFICIAL_PAGES
#   name    - display name; also the first Wikimedia search term
#   aliases - substrings that mark a candidate as relevant (has_alias_text);
#             the first two are also used as Wikimedia search terms
#   queries - search strings sent to image.so.com
ITEMS = [
    {"id":"01_陶鹰鼎", "name":"陶鹰鼎", "aliases":["陶鹰鼎","陶鹰","鹰鼎"], "queries":["陶鹰鼎 中国国家博物馆 高清", "陶鹰鼎 高清 大图", "国博 陶鹰鼎", "陶鹰鼎 侧面 细节"]},
    {"id":"02_后母戊鼎", "name":"后母戊鼎", "aliases":["后母戊鼎","司母戊鼎","后母戊","司母戊"], "queries":["后母戊鼎 中国国家博物馆 高清", "司母戊鼎 高清 大图", "国博 后母戊鼎", "后母戊鼎 侧面 细节"]},
    {"id":"03_青铜冰鉴", "name":"青铜冰鉴", "aliases":["青铜冰鉴","冰鉴"], "queries":["青铜冰鉴 中国国家博物馆 高清", "青铜冰鉴 高清 大图", "国博 青铜冰鉴", "青铜冰鉴 细节"]},
    {"id":"04_击鼓说唱俑", "name":"击鼓说唱俑", "aliases":["击鼓说唱俑","说唱俑","击鼓俑"], "queries":["击鼓说唱俑 中国国家博物馆 高清", "击鼓说唱俑 高清 大图", "国博 击鼓说唱俑", "击鼓说唱俑 侧面 细节"]},
    {"id":"05_青瓷莲花尊", "name":"青瓷莲花尊", "aliases":["青瓷莲花尊","莲花尊"], "queries":["青瓷莲花尊 中国国家博物馆 高清", "青瓷莲花尊 高清 大图", "国博 青瓷莲花尊", "青瓷莲花尊 细节"]},
    {"id":"06_载乐骆驼", "name":"载乐骆驼", "aliases":["载乐骆驼","三彩载乐骆驼","骆驼载乐俑","唐三彩载乐骆驼"], "queries":["载乐骆驼 中国国家博物馆 高清", "三彩载乐骆驼 高清 大图", "国博 载乐骆驼", "载乐骆驼 侧面 细节"]},
    {"id":"07_绿釉鸱吻", "name":"绿釉鸱吻", "aliases":["绿釉鸱吻","鸱吻"], "queries":["绿釉鸱吻 中国国家博物馆 高清", "绿釉鸱吻 高清 大图", "国博 绿釉鸱吻", "绿釉鸱吻 细节"]},
    {"id":"08_海晏河清尊", "name":"海晏河清尊", "aliases":["海晏河清尊","海晏河清"], "queries":["海晏河清尊 中国国家博物馆 高清", "海晏河清尊 高清 大图", "国博 海晏河清尊", "海晏河清尊 细节"]},
]

# Museum web pages scraped directly for images, keyed by the `id` of an ITEMS
# entry. Items without an entry here simply get no official-page candidates.
OFFICIAL_PAGES = {
    "02_后母戊鼎": [
        "https://www.chnmuseum.cn/Portals/0/web/zt/100n/guobao_content-4.html?id=26",
        "https://www.chnmuseum.cn/sp/gbzp/rggbhsh/202203/t20220303_254156.shtml",
    ],
    "03_青铜冰鉴": ["https://www.chnmuseum.cn/zp/zpml/kgfjp/202008/t20200824_247219.shtml"],
    "05_青瓷莲花尊": ["https://www.chnmuseum.cn/Portals/0/web/zt/100n/guobao_content-7.html?id=29"],
}

# Browser-like User-Agent header sent by fetch() and download_image().
UA = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 Chrome/125 Safari/537.36'

def fetch(url, timeout=25, binary=False, referer=None):
    """Download *url* and return its body, or ``None`` on any failure.

    Args:
        url: Address to request.
        timeout: Socket timeout in seconds, passed to ``urlopen``.
        binary: When true, return the raw ``bytes``; otherwise return text.
        referer: Optional value for the ``Referer`` request header.

    Returns:
        ``bytes`` when *binary* is true, otherwise ``str``. Text is decoded
        with the charset announced in the response's ``Content-Type`` header
        and falls back to UTF-8 when none (or an unknown one) is declared;
        undecodable bytes are dropped. ``None`` when the request could not be
        built or completed.
    """
    headers = {'User-Agent': UA, 'Accept': '*/*'}
    if referer:
        headers['Referer'] = referer
    try:
        # Request() itself raises ValueError on a malformed URL, so it lives
        # inside the try to honour the "None on failure" contract.
        req = urllib.request.Request(url, headers=headers)
        with urllib.request.urlopen(req, timeout=timeout) as r:
            data = r.read()
            charset = None if binary else r.headers.get_content_charset()
    except Exception:
        # Best-effort fetch: callers treat None as "skip this source".
        return None
    if binary:
        return data
    try:
        return data.decode(charset or 'utf-8', 'ignore')
    except LookupError:
        # The server declared a codec Python does not know.
        return data.decode('utf-8', 'ignore')

def norm_url(u, base=None):
    """Clean up a scraped URL and optionally resolve it against *base*.

    Backslashes are turned into forward slashes and surrounding whitespace is
    removed. When *base* is truthy the result is joined onto it with
    ``urllib.parse.urljoin``. A falsy *u* yields ``None``.
    """
    if not u:
        return None
    cleaned = u.replace('\\', '/').strip()
    return urllib.parse.urljoin(base, cleaned) if base else cleaned

def extract_images_from_page(url):
    """Scrape candidate image URLs from one official museum page.

    The page is fetched with a chnmuseum.cn referer. The first 5000 characters
    of its text are appended to ``SOURCES/official_pages_raw.txt``. Image URLs
    are taken from HTML attributes and from CSS ``url(...)`` references,
    resolved against *url*, and de-duplicated in order of first appearance.

    Args:
        url: Page address to scrape.

    Returns:
        A list of candidate dicts with keys ``url``, ``source_page``,
        ``title`` (the page's ``<title>`` text) and ``engine``
        (``'official_page'`` for attribute hits, ``'official_bg'`` for CSS
        hits). Empty when the page could not be fetched.
    """
    txt = fetch(url, referer='https://www.chnmuseum.cn/')
    out = []
    if not txt:
        return out
    # Keep the head of every fetched page for auditing; the `with` makes sure
    # the handle is flushed and closed instead of waiting for garbage collection.
    with (SOURCES/'official_pages_raw.txt').open('a', encoding='utf-8') as raw_log:
        raw_log.write(f"\n\n===== {url} =====\n"+txt[:5000])
    title = ''
    title_match = re.search(r'<title[^>]*>(.*?)</title>', txt, re.I|re.S)
    if title_match:
        title = re.sub('<.*?>', '', title_match.group(1)).strip()
    # Substrings that mark site chrome (logos, QR codes, icons, ...) rather than artifact photos.
    skip_markers = ['logo','ewm','icon','header','footer','search','download','blue.png','close.png','wx','wb']
    seen = set()  # O(1) de-duplication shared by both passes
    for m in re.finditer(r'(?:src|href|data-src|zoomfile|file)=["\']([^"\']+?\.(?:jpg|jpeg|png|webp)(?:\?[^"\']*)?)["\']', txt, re.I):
        im = norm_url(m.group(1), url)
        if not im:
            continue
        low = im.lower()
        if any(x in low for x in skip_markers):
            continue
        if im not in seen:
            seen.add(im)
            out.append({'url': im, 'source_page': url, 'title': title, 'engine':'official_page'})
    # background:url(...) — the optional quotes sit outside the capture group so
    # that url("x.jpg") and url('x.jpg') match as well as the bare url(x.jpg).
    for m in re.finditer(r'url\(\s*["\']?([^\)"\']+?\.(?:jpg|jpeg|png|webp))["\']?\s*\)', txt, re.I):
        im = norm_url(m.group(1), url)
        if im and im not in seen:
            seen.add(im)
            out.append({'url': im, 'source_page': url, 'title': title, 'engine':'official_bg'})
    return out

def image_so_candidates(query, pages=3):
    """Collect image candidates for *query* from the image.so.com JSON endpoint.

    Requests *pages* result pages whose ``sn`` offset advances by 30 per page.
    The ``sid`` returned by one page is sent with the next request. A page is
    skipped when the request fails, the body is not valid JSON, or the JSON is
    not an object; entries that are not objects or carry no image URL are
    skipped too. The function sleeps 0.5 s after each successfully parsed page.

    Args:
        query: Search string.
        pages: Number of result pages to request.

    Returns:
        A list of candidate dicts with keys ``url``, ``source_page``,
        ``title``, ``engine`` (``'image.so'``), ``query``, ``reported_width``,
        ``reported_height`` and ``source``.
    """
    out = []
    sid = ''
    for page in range(pages):
        sn = page*30
        url = 'https://image.so.com/j?' + urllib.parse.urlencode({
            'q': query, 'pd':'1','pn':'30','correct':query,'adstar':'0','tab':'all','sid':sid,'ras':'0','cn':'0','gn':'0','kn':'50','crn':'0','bxn':'20','cuben':'0','pornn':'0','manun':'50','src':'srp','sn':str(sn)
        })
        txt = fetch(url, referer='https://image.so.com/')
        if not txt:
            continue
        try:
            data = json.loads(txt)
        except ValueError:  # json.JSONDecodeError is a ValueError
            continue
        if not isinstance(data, dict):
            # Valid JSON of an unexpected shape must not abort the whole run.
            continue
        sid = data.get('sid') or sid
        entries = data.get('list')
        if not isinstance(entries, list):
            # Covers both a missing key and an explicit null.
            entries = []
        for d in entries:
            if not isinstance(d, dict):
                continue
            im = d.get('img') or d.get('thumb') or d.get('qhimg_url')
            if not im:
                continue
            out.append({
                'url': im, 'source_page': d.get('link') or d.get('sourceurl') or '',
                'title': d.get('title') or d.get('grpmd5') or '',
                'engine': 'image.so', 'query': query,
                'reported_width': d.get('width'), 'reported_height': d.get('height'), 'source': d.get('src') or d.get('site') or ''
            })
        time.sleep(0.5)
    return out

def wikimedia_candidates(query):
    """Search Wikimedia Commons files for *query* and return image candidates.

    Issues one ``action=query`` / ``generator=search`` request limited to
    namespace 6 and 20 results, asking for each file's URL, size and MIME
    type. Returns an empty list when the request fails or the body is not
    valid JSON. Pages without an image URL are left out.
    """
    endpoint = 'https://commons.wikimedia.org/w/api.php'
    request_url = endpoint + '?' + urllib.parse.urlencode({
        'action': 'query', 'generator': 'search', 'gsrsearch': query,
        'gsrnamespace': '6', 'gsrlimit': '20',
        'prop': 'imageinfo', 'iiprop': 'url|size|mime',
        'format': 'json', 'origin': '*',
    })
    body = fetch(request_url, referer='https://commons.wikimedia.org/')
    if not body:
        return []
    try:
        payload = json.loads(body)
    except Exception:
        return []
    found = []
    pages = payload.get('query', {}).get('pages', {}) or {}
    for page in pages.values():
        # Only the first imageinfo entry is used.
        info = (page.get('imageinfo') or [{}])[0]
        file_url = info.get('url')
        if not file_url:
            continue
        page_title = page.get('title', '')
        found.append({
            'url': file_url,
            'source_page': 'https://commons.wikimedia.org/wiki/' + urllib.parse.quote(page_title.replace(' ', '_')),
            'title': page_title,
            'engine': 'wikimedia',
            'reported_width': info.get('width'),
            'reported_height': info.get('height'),
        })
    return found

def ext_from_url(u, content_type=''):
    """Pick the file suffix to save an image under.

    The URL path (lower-cased, query string ignored) wins when it ends in a
    known image suffix, with ``.jpeg`` folded into ``.jpg``. Otherwise the
    suffix is guessed from *content_type*, and ``.jpg`` is the final default.
    """
    url_path = urllib.parse.urlparse(u).path.lower()
    canonical = {'.jpg': '.jpg', '.jpeg': '.jpg', '.png': '.png', '.webp': '.webp'}
    for suffix, normalized in canonical.items():
        if url_path.endswith(suffix):
            return normalized
    for marker in ('png', 'webp'):
        if marker in content_type:
            return '.' + marker
    return '.jpg'

def download_image(cand, dest_base, referer=None):
    """Download one candidate image, validate it, and store it on disk.

    Args:
        cand: Candidate dict; only ``cand['url']`` is read.
        dest_base: ``Path`` of the destination file; its suffix is replaced
            with the one chosen by ``ext_from_url``.
        referer: Optional value for the ``Referer`` request header.

    Returns:
        ``(info, None)`` on success, where *info* has the keys ``path``,
        ``sha256``, ``width``, ``height`` and ``bytes``; ``(None, reason)``
        when the download fails, the payload is under 8000 bytes, Pillow
        cannot read it, or its longer side is under 450 px. Rejected files
        are removed from disk again.
    """
    u = cand['url']
    try:
        # Request() raises ValueError for scheme-less or otherwise malformed
        # URLs; keeping it inside the try turns that into a reported error
        # instead of aborting the whole run.
        req = urllib.request.Request(u, headers={'User-Agent': UA, 'Accept':'image/avif,image/webp,image/apng,image/*,*/*;q=0.8'})
        if referer:
            req.add_header('Referer', referer)
        with urllib.request.urlopen(req, timeout=8) as r:
            ctype = r.headers.get('Content-Type', '')
            data = r.read()
    except Exception as e:
        return None, f'download_error:{type(e).__name__}:{e}'
    if len(data) < 8000:
        return None, 'too_small_bytes'
    sha = hashlib.sha256(data).hexdigest()
    ext = ext_from_url(u, ctype)
    tmp = dest_base.with_suffix(ext)
    tmp.write_bytes(data)
    try:
        # verify() leaves the image object unusable, so the file is opened a
        # second time to read its size. Both handles are closed by `with`.
        with Image.open(tmp) as probe:
            probe.verify()
        with Image.open(tmp) as im:
            w, h = im.size
    except Exception as e:
        tmp.unlink(missing_ok=True)
        return None, f'invalid_image:{e}'
    if max(w, h) < 450:
        tmp.unlink(missing_ok=True)
        return None, f'too_low_res:{w}x{h}'
    return {'path': str(tmp), 'sha256': sha, 'width': w, 'height': h, 'bytes': len(data)}, None

def has_alias_text(text, aliases):
    """Return True when any alias occurs in *text*, ignoring case.

    A falsy *text* is treated as the empty string.
    """
    haystack = (text or '').lower()
    for alias in aliases:
        if alias.lower() in haystack:
            return True
    return False

def make_contact_sheet(item_dir, records, out_path, title):
    """Render a labelled thumbnail grid of downloaded images and save it.

    Args:
        item_dir: Not used by this function; kept for existing callers.
        records: Record dicts. Only those with a truthy ``path`` are drawn,
            and each of those must also provide ``width`` and ``height``.
        out_path: Destination file for the sheet.
        title: Heading drawn in the top-left corner.

    Returns:
        ``str(out_path)``, or ``None`` when no record has a path.
    """
    # only downloaded records
    recs = [r for r in records if r.get('path')]
    if not recs:
        return None
    thumb_w, thumb_h = 220, 180
    label_h = 52
    cols = 4
    rows = (len(recs)+cols-1)//cols  # ceiling division
    W = cols*thumb_w
    H = rows*(thumb_h+label_h)+60    # extra 60 px leave room for the heading
    sheet = Image.new('RGB', (W, H), 'white')
    draw = ImageDraw.Draw(sheet)
    try:
        font = ImageFont.truetype('/System/Library/Fonts/PingFang.ttc', 14)
        font_s = ImageFont.truetype('/System/Library/Fonts/PingFang.ttc', 11)
    except Exception:
        # Font file unavailable: let Pillow fall back to its default font.
        font = font_s = None
    draw.text((10, 10), title, fill='black', font=font)
    for idx, r in enumerate(recs):
        x = (idx % cols)*thumb_w
        y = 50+(idx//cols)*(thumb_h+label_h)
        try:
            # convert() returns an independent in-memory copy, so the source
            # file handle can be closed right away instead of leaking.
            with Image.open(r['path']) as src_im:
                im = src_im.convert('RGB')
            im.thumbnail((thumb_w-12, thumb_h-12), Image.LANCZOS)
            px = x+(thumb_w-im.width)//2
            py = y+(thumb_h-im.height)//2
            sheet.paste(im, (px, py))
        except Exception:
            # Best effort: an unreadable file leaves its cell blank but the
            # label below is still drawn.
            pass
        label = f"{idx+1:02d} {r['width']}×{r['height']} {r.get('engine','')}"
        src = (r.get('source') or r.get('title') or '')[:18]
        draw.text((x+6, y+thumb_h+2), label, fill='black', font=font_s)
        draw.text((x+6, y+thumb_h+20), src, fill=(80, 80, 80), font=font_s)
    sheet.save(out_path, quality=92)
    return str(out_path)

def _to_int(value):
    """Return *value* as an int, or 0 when it is missing or not numeric."""
    try:
        return int(value or 0)
    except (TypeError, ValueError):
        return 0


def _tsv_cell(value):
    """Render *value* as a single TSV cell: tabs and line breaks become spaces."""
    return str(value).replace('\t', ' ').replace('\r', ' ').replace('\n', ' ')


def _score_candidate(c, aliases):
    """Set ``c['match_text']`` and return the ranking score for candidate *c*."""
    text = ' '.join(str(c.get(k) or '') for k in ('title', 'source_page', 'url', 'query'))
    c['match_text'] = has_alias_text(text, aliases)
    engine = c.get('engine') or ''
    url_l = str(c.get('url') or '').lower()
    page_l = str(c.get('source_page') or '').lower()
    title_l = str(c.get('title') or '').lower()
    score = 0
    if engine.startswith('official'): score += 100
    if engine == 'wikimedia': score += 50
    if c['match_text']: score += 30
    longest = max(_to_int(c.get('reported_width')), _to_int(c.get('reported_height')))
    if longest >= 1500: score += 15
    if longest >= 2500: score += 15
    if any(x in (url_l+page_l) for x in ['sinaimg.cn','photo.tuchong.com','wikimedia','chnmuseum.cn']): score += 10
    if any(x in (url_l+title_l) for x in ['logo','watermark','qrcode','二维码']): score -= 80
    return score


def main():
    """Collect, download and catalogue images for the configured items.

    The ``ONLY_ITEMS`` environment variable may hold a comma-separated list of
    item ids or names; when set, only those items are processed. For each
    item, candidates are gathered from the official pages, Wikimedia Commons
    and image.so.com, scored, and written to ``SOURCES``. At most 15 new
    images are then downloaded from the top 80 candidates, skipping any whose
    SHA-256 matches a file already under ``IMG_ROOT``. A contact sheet is
    produced per item, and the run's records, errors and a TSV listing are
    written to ``DOCS``.
    """
    image_exts = ['.jpg','.jpeg','.png','.webp']
    only_env = os.environ.get('ONLY_ITEMS','').strip()
    only = {x.strip() for x in only_env.split(',') if x.strip()}
    run_items = [it for it in ITEMS if not only or it['id'] in only or it['name'] in only]
    all_records = []
    errors = []
    # carry existing file hashes so reruns do not duplicate within the same package
    seen_sha = set()
    seen_url = set()
    for f in IMG_ROOT.glob('*/*'):
        if f.is_file() and f.suffix.lower() in image_exts:
            try:
                seen_sha.add(hashlib.sha256(f.read_bytes()).hexdigest())
            except OSError:
                # An unreadable file just cannot contribute a hash.
                continue
    for item in run_items:
        item_dir = IMG_ROOT/item['id']
        item_dir.mkdir(parents=True, exist_ok=True)
        candidates = []
        for page in OFFICIAL_PAGES.get(item['id'], []):
            candidates += extract_images_from_page(page)
        # Wikimedia exact and alias
        for q in [item['name']] + item['aliases'][:2]:
            candidates += wikimedia_candidates(q)
            time.sleep(0.4)
        # 360 image candidates
        for q in item['queries']:
            candidates += image_so_candidates(q, pages=1)
        # de-URL; score (seen_url spans all items, so a URL is only considered once per run)
        uniq = []
        for c in candidates:
            u = c.get('url')
            if not u or u in seen_url:
                continue
            seen_url.add(u)
            c['score'] = _score_candidate(c, item['aliases'])
            uniq.append(c)
        uniq.sort(key=lambda x: x.get('score', 0), reverse=True)
        (SOURCES/f"{item['id']}_candidates.json").write_text(json.dumps(uniq,ensure_ascii=False,indent=2),encoding='utf-8')
        downloaded = []
        # Number new files after the ones already present in the item directory.
        existing_count = len([f for f in item_dir.iterdir() if f.is_file() and f.suffix.lower() in image_exts])
        for i, c in enumerate(uniq[:80], 1):
            if len(downloaded) >= 15:
                break
            # keep some low-confidence but prefer matched/official/wiki/top
            if i > 35 and c.get('score', 0) < 30:
                continue
            safe_engine = re.sub(r'\W+','_',c.get('engine','web'))[:12]
            base = item_dir/f"{existing_count+len(downloaded)+1:02d}_{safe_engine}"
            info, err = download_image(c, base, referer=c.get('source_page') or 'https://image.so.com/')
            if err:
                errors.append({'item':item['name'],'url':c.get('url'),'error':err})
                continue
            if info['sha256'] in seen_sha:
                # Same bytes already stored: drop the fresh copy.
                Path(info['path']).unlink(missing_ok=True)
                continue
            seen_sha.add(info['sha256'])
            rec = {**c, **info, 'item':item['name'], 'item_id':item['id'], 'file':Path(info['path']).name}
            # rename with dimensions for easier browsing
            new_path = item_dir/f"{existing_count+len(downloaded)+1:02d}_{safe_engine}_{info['width']}x{info['height']}{Path(info['path']).suffix}"
            Path(info['path']).rename(new_path)
            rec['path'] = str(new_path)
            rec['file'] = new_path.name
            downloaded.append(rec)
            all_records.append(rec)
            time.sleep(0.15)
        sheet = make_contact_sheet(item_dir, downloaded, ROOT/'contact_sheets'/f"{item['id']}_缩略图总览.jpg", f"{item['name']} 初筛图片 {len(downloaded)}张")
        print(item['name'], 'downloaded', len(downloaded), 'sheet', sheet)
    # global metadata
    (DOCS/'image_records.json').write_text(json.dumps(all_records,ensure_ascii=False,indent=2),encoding='utf-8')
    (DOCS/'download_errors.json').write_text(json.dumps(errors,ensure_ascii=False,indent=2),encoding='utf-8')
    columns = ['item','file','width','height','bytes','engine','title','source_page','url','sha256']
    with (DOCS/'图片清单.tsv').open('w',encoding='utf-8') as f:
        f.write('item\tfile\twidth\theight\tbytes\tengine\ttitle\tsource_page\timage_url\tsha256\n')
        for r in all_records:
            # Titles scraped from HTML may span lines; flatten them so every
            # record stays on exactly one row.
            f.write('\t'.join(_tsv_cell(r.get(k, '')) for k in columns)+'\n')
    print('TOTAL',len(all_records),'records')

# Run the collection only when this file is executed as a script.
if __name__=='__main__':
    main()
