#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Collect candidate images for eight National Museum of China artefacts.

For every entry in ITEMS the script gathers image URLs from three sources
(hand-picked official museum pages, Wikimedia Commons search, and the
image.so.com JSON endpoint), scores and de-duplicates them, downloads the
best ones into ``images/<item id>/``, builds a thumbnail contact sheet per
item, and finally writes JSON/TSV metadata under ``docs/``.

Set the ``ONLY_ITEMS`` environment variable to a comma-separated list of item
ids or names to restrict a run to those items.
"""
import os
import re
import json
import time
import hashlib
import urllib.parse
import urllib.request
import shutil
import ssl
from pathlib import Path

from PIL import Image, ImageOps, ImageDraw, ImageFont, UnidentifiedImageError

ROOT = Path('/Users/bot1/Volumes/root_for_ai/AI工作区/国博_图片素材_馆藏八大件_20260612_1857')
IMG_ROOT = ROOT/'images'
CACHE = ROOT/'cache'
DOCS = ROOT/'docs'
SOURCES = ROOT/'sources'
# The output tree is created at import time; the functions below write into it.
for p in [IMG_ROOT, CACHE, DOCS, SOURCES, ROOT/'contact_sheets', ROOT/'deliverables']:
    p.mkdir(parents=True, exist_ok=True)

# File suffixes treated as already-downloaded images when scanning the tree.
IMAGE_SUFFIXES = ('.jpg', '.jpeg', '.png', '.webp')

ITEMS = [
    {"id":"01_陶鹰鼎", "name":"陶鹰鼎", "aliases":["陶鹰鼎","陶鹰","鹰鼎"], "queries":["陶鹰鼎中国国家博物馆高清", "陶鹰鼎高清大图", "国博陶鹰鼎", "陶鹰鼎侧面细节"]},
    {"id":"02_后母戊鼎", "name":"后母戊鼎", "aliases":["后母戊鼎","司母戊鼎","后母戊","司母戊"], "queries":["后母戊鼎中国国家博物馆高清", "司母戊鼎高清大图", "国博后母戊鼎", "后母戊鼎侧面细节"]},
    {"id":"03_青铜冰鉴", "name":"青铜冰鉴", "aliases":["青铜冰鉴","冰鉴"], "queries":["青铜冰鉴中国国家博物馆高清", "青铜冰鉴高清大图", "国博青铜冰鉴", "青铜冰鉴细节"]},
    {"id":"04_击鼓说唱俑", "name":"击鼓说唱俑", "aliases":["击鼓说唱俑","说唱俑","击鼓俑"], "queries":["击鼓说唱俑中国国家博物馆高清", "击鼓说唱俑高清大图", "国博击鼓说唱俑", "击鼓说唱俑侧面细节"]},
    {"id":"05_青瓷莲花尊", "name":"青瓷莲花尊", "aliases":["青瓷莲花尊","莲花尊"], "queries":["青瓷莲花尊中国国家博物馆高清", "青瓷莲花尊高清大图", "国博青瓷莲花尊", "青瓷莲花尊细节"]},
    {"id":"06_载乐骆驼", "name":"载乐骆驼", "aliases":["载乐骆驼","三彩载乐骆驼","骆驼载乐俑","唐三彩载乐骆驼"], "queries":["载乐骆驼中国国家博物馆高清", "三彩载乐骆驼高清大图", "国博载乐骆驼", "载乐骆驼侧面细节"]},
    {"id":"07_绿釉鸱吻", "name":"绿釉鸱吻", "aliases":["绿釉鸱吻","鸱吻"], "queries":["绿釉鸱吻中国国家博物馆高清", "绿釉鸱吻高清大图", "国博绿釉鸱吻", "绿釉鸱吻细节"]},
    {"id":"08_海晏河清尊", "name":"海晏河清尊", "aliases":["海晏河清尊","海晏河清"], "queries":["海晏河清尊中国国家博物馆高清", "海晏河清尊高清大图", "国博海晏河清尊", "海晏河清尊细节"]},
]

OFFICIAL_PAGES = {
    "02_后母戊鼎": [
        "https://www.chnmuseum.cn/Portals/0/web/zt/100n/guobao_content-4.html?id=26",
        "https://www.chnmuseum.cn/sp/gbzp/rggbhsh/202203/t20220303_254156.shtml",
    ],
    "03_青铜冰鉴": ["https://www.chnmuseum.cn/zp/zpml/kgfjp/202008/t20200824_247219.shtml"],
    "05_青瓷莲花尊": ["https://www.chnmuseum.cn/Portals/0/web/zt/100n/guobao_content-7.html?id=29"],
}

UA = ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) '
      'AppleWebKit/537.36 Chrome/125 Safari/537.36')

# NOTE(review): the title pattern in the received source was garbled to
# r']*>(.*?)', which always captures an empty string. It is restored here to
# the <title> element, which is what the tag-stripping code that consumes the
# match expects -- confirm against the original script if it is available.
_TITLE_RE = re.compile(r'<title[^>]*>(.*?)</title>', re.I | re.S)
# Image URLs referenced from HTML attributes.
_ATTR_IMG_RE = re.compile(
    r'(?:src|href|data-src|zoomfile|file)=["\']([^"\']+?\.(?:jpg|jpeg|png|webp)(?:\?[^"\']*)?)["\']',
    re.I,
)
# Image URLs referenced from CSS ``url(...)`` values.
_CSS_IMG_RE = re.compile(r'url\(([^\)]+?\.(?:jpg|jpeg|png|webp))\)', re.I)
# URL fragments that mark site chrome rather than artefact photographs.
_CHROME_MARKERS = ('logo', 'ewm', 'icon', 'header', 'footer', 'search',
                   'download', 'blue.png', 'close.png', 'wx', 'wb')


def fetch(url, timeout=25, binary=False, referer=None):
    """Download *url* and return its body, or ``None`` on any failure.

    Args:
        url: Address to request.
        timeout: Socket timeout in seconds.
        binary: Return raw ``bytes`` when true; otherwise decode as UTF-8,
            ignoring undecodable bytes.
        referer: Optional ``Referer`` header value.
    """
    req = urllib.request.Request(url, headers={'User-Agent': UA, 'Accept': '*/*'})
    if referer:
        req.add_header('Referer', referer)
    try:
        with urllib.request.urlopen(req, timeout=timeout) as r:
            data = r.read()
    except Exception:
        # Deliberately best-effort: callers treat None as "source unavailable".
        return None
    return data if binary else data.decode('utf-8', 'ignore')


def norm_url(u, base=None):
    """Normalise a scraped URL: fix backslashes, trim, resolve against *base*.

    Returns ``None`` when *u* is empty or contains only whitespace.
    """
    if not u:
        return None
    u = u.replace('\\', '/').strip()
    if not u:
        # A whitespace-only reference would otherwise resolve to *base* itself.
        return None
    if base:
        u = urllib.parse.urljoin(base, u)
    return u


def extract_images_from_page(url):
    """Scrape image URLs from one official museum page.

    The first 5000 characters of the page are appended to
    ``sources/official_pages_raw.txt`` for later inspection.

    Returns:
        A list of candidate dicts with keys ``url``, ``source_page``,
        ``title`` and ``engine`` (``official_page`` for attribute references,
        ``official_bg`` for CSS backgrounds), in order of first appearance.
    """
    txt = fetch(url, referer='https://www.chnmuseum.cn/')
    out = []
    if not txt:
        return out
    with (SOURCES/'official_pages_raw.txt').open('a', encoding='utf-8') as fh:
        fh.write(f"\n\n===== {url} =====\n"+txt[:5000])

    title = ''
    m = _TITLE_RE.search(txt)
    if m:
        title = re.sub('<.*?>', '', m.group(1)).strip()

    seen = set()
    for m in _ATTR_IMG_RE.finditer(txt):
        im = norm_url(m.group(1), url)
        if not im:
            continue
        low = im.lower()
        if any(x in low for x in _CHROME_MARKERS):
            continue
        if im not in seen:
            seen.add(im)
            out.append({'url': im, 'source_page': url, 'title': title, 'engine': 'official_page'})
    # background:url(...)
    for m in _CSS_IMG_RE.finditer(txt):
        im = norm_url(m.group(1).strip('"\''), url)
        if im and im not in seen:
            seen.add(im)
            out.append({'url': im, 'source_page': url, 'title': title, 'engine': 'official_bg'})
    return out


def image_so_candidates(query, pages=3):
    """Query the image.so.com JSON endpoint and return candidate dicts.

    Args:
        query: Search phrase.
        pages: Number of result pages to request; the ``sn`` offset advances
            by 30 per page.

    Pages that fail to download or parse are skipped.
    """
    out = []
    sid = ''
    for page in range(pages):
        sn = page*30
        url = 'https://image.so.com/j?' + urllib.parse.urlencode({
            'q': query, 'pd': '1', 'pn': '30', 'correct': query, 'adstar': '0',
            'tab': 'all', 'sid': sid, 'ras': '0', 'cn': '0', 'gn': '0', 'kn': '50',
            'crn': '0', 'bxn': '20', 'cuben': '0', 'pornn': '0', 'manun': '50',
            'src': 'srp', 'sn': str(sn),
        })
        txt = fetch(url, referer='https://image.so.com/')
        if not txt:
            continue
        try:
            data = json.loads(txt)
        except Exception:
            continue
        # Carry the session id returned by the service into the next request.
        sid = data.get('sid') or sid
        for d in data.get('list', []):
            im = d.get('img') or d.get('thumb') or d.get('qhimg_url')
            if not im:
                continue
            out.append({
                'url': im,
                'source_page': d.get('link') or d.get('sourceurl') or '',
                'title': d.get('title') or d.get('grpmd5') or '',
                'engine': 'image.so',
                'query': query,
                'reported_width': d.get('width'),
                'reported_height': d.get('height'),
                'source': d.get('src') or d.get('site') or '',
            })
        time.sleep(0.5)
    return out


def wikimedia_candidates(query):
    """Search Wikimedia Commons (namespace 6, up to 20 hits) for *query*.

    Returns candidate dicts carrying the original file URL and the
    width/height reported by the API; an empty list on any failure.
    """
    out = []
    api = 'https://commons.wikimedia.org/w/api.php'
    params = {
        'action': 'query', 'generator': 'search', 'gsrsearch': query,
        'gsrnamespace': '6', 'gsrlimit': '20',
        'prop': 'imageinfo', 'iiprop': 'url|size|mime', 'format': 'json', 'origin': '*',
    }
    txt = fetch(api+'?'+urllib.parse.urlencode(params), referer='https://commons.wikimedia.org/')
    if not txt:
        return out
    try:
        data = json.loads(txt)
    except Exception:
        return out
    for page in (data.get('query', {}).get('pages', {}) or {}).values():
        ii = (page.get('imageinfo') or [{}])[0]
        u = ii.get('url')
        if u:
            out.append({
                'url': u,
                'source_page': 'https://commons.wikimedia.org/wiki/'+urllib.parse.quote(page.get('title', '').replace(' ', '_')),
                'title': page.get('title', ''),
                'engine': 'wikimedia',
                'reported_width': ii.get('width'),
                'reported_height': ii.get('height'),
            })
    return out


def ext_from_url(u, content_type=''):
    """Pick a file extension from the URL path, else from the content type.

    ``.jpeg`` is folded into ``.jpg``; ``.jpg`` is the final fallback.
    """
    path = urllib.parse.urlparse(u).path.lower()
    for ext in ['.jpg', '.jpeg', '.png', '.webp']:
        if path.endswith(ext):
            return '.jpg' if ext == '.jpeg' else ext
    if 'png' in content_type:
        return '.png'
    if 'webp' in content_type:
        return '.webp'
    return '.jpg'


def download_image(cand, dest_base, referer=None):
    """Download one candidate and validate it as an image.

    Args:
        cand: Candidate dict; only ``cand['url']`` is read.
        dest_base: Destination ``Path`` without extension.
        referer: Optional ``Referer`` header value.

    Returns:
        ``(info, None)`` on success, where *info* has ``path``, ``sha256``,
        ``width``, ``height`` and ``bytes``; ``(None, reason)`` otherwise.
        Payloads under 8000 bytes, files Pillow cannot verify, and images
        whose longer side is under 450 px are rejected and removed.
    """
    u = cand['url']
    req = urllib.request.Request(u, headers={
        'User-Agent': UA,
        'Accept': 'image/avif,image/webp,image/apng,image/*,*/*;q=0.8',
    })
    if referer:
        req.add_header('Referer', referer)
    try:
        with urllib.request.urlopen(req, timeout=8) as r:
            ctype = r.headers.get('Content-Type', '')
            data = r.read()
    except Exception as e:
        return None, f'download_error:{type(e).__name__}:{e}'
    if len(data) < 8000:
        return None, 'too_small_bytes'
    sha = hashlib.sha256(data).hexdigest()
    ext = ext_from_url(u, ctype)
    tmp = dest_base.with_suffix(ext)
    tmp.write_bytes(data)
    try:
        # verify() leaves the image object unusable, so the file is opened a
        # second time to read its size. Both handles are closed promptly.
        with Image.open(tmp) as im:
            im.verify()
        with Image.open(tmp) as im:
            w, h = im.size
    except Exception as e:
        tmp.unlink(missing_ok=True)
        return None, f'invalid_image:{e}'
    if max(w, h) < 450:
        tmp.unlink(missing_ok=True)
        return None, f'too_low_res:{w}x{h}'
    return {'path': str(tmp), 'sha256': sha, 'width': w, 'height': h, 'bytes': len(data)}, None


def has_alias_text(text, aliases):
    """Return True when any alias occurs in *text*, ignoring case."""
    t = (text or '').lower()
    return any(a.lower() in t for a in aliases)


def _draw_text(draw, xy, text, fill, font):
    """Draw *text*, degrading to ASCII if the active font cannot encode it."""
    try:
        draw.text(xy, text, fill=fill, font=font)
    except UnicodeEncodeError:
        # Some Pillow builds fall back to a Latin-1 bitmap font when no
        # TrueType font is supplied; keep the sheet rather than crash.
        draw.text(xy, text.encode('ascii', 'replace').decode('ascii'), fill=fill, font=font)


def make_contact_sheet(item_dir, records, out_path, title):
    """Render a 4-column thumbnail overview of the downloaded records.

    Args:
        item_dir: Unused; kept for call compatibility.
        records: Record dicts; only those with a ``path`` are drawn.
        out_path: Where to save the sheet.
        title: Heading drawn at the top of the sheet.

    Returns:
        ``str(out_path)``, or ``None`` when there is nothing to draw.
    """
    # only downloaded records
    recs = [r for r in records if r.get('path')]
    if not recs:
        return None
    thumb_w, thumb_h = 220, 180
    label_h = 52
    cols = 4
    rows = (len(recs)+cols-1)//cols
    W = cols*thumb_w
    H = rows*(thumb_h+label_h)+60
    sheet = Image.new('RGB', (W, H), 'white')
    draw = ImageDraw.Draw(sheet)
    try:
        font = ImageFont.truetype('/System/Library/Fonts/PingFang.ttc', 14)
        font_s = ImageFont.truetype('/System/Library/Fonts/PingFang.ttc', 11)
    except Exception:
        # Font is unavailable on this machine: use Pillow's default.
        font = font_s = None
    _draw_text(draw, (10, 10), title, 'black', font)
    for idx, r in enumerate(recs):
        x = (idx % cols)*thumb_w
        y = 50+(idx//cols)*(thumb_h+label_h)
        try:
            with Image.open(r['path']) as src_im:
                im = src_im.convert('RGB')
            im.thumbnail((thumb_w-12, thumb_h-12), Image.LANCZOS)
            px = x+(thumb_w-im.width)//2
            py = y+(thumb_h-im.height)//2
            sheet.paste(im, (px, py))
        except Exception:
            # Best-effort: an unreadable file leaves an empty cell, labels still drawn.
            pass
        label = f"{idx+1:02d} {r['width']}×{r['height']} {r.get('engine','')}"
        src = (r.get('source') or r.get('title') or '')[:18]
        _draw_text(draw, (x+6, y+thumb_h+2), label, 'black', font_s)
        _draw_text(draw, (x+6, y+thumb_h+20), src, (80, 80, 80), font_s)
    sheet.save(out_path, quality=92)
    return str(out_path)


def _to_int(value):
    """Coerce an API-reported dimension to int; 0 when missing or malformed."""
    try:
        return int(value or 0)
    except (TypeError, ValueError):
        return 0


def _gather_candidates(item):
    """Collect raw candidates for *item* from all three sources, in order."""
    candidates = []
    for page in OFFICIAL_PAGES.get(item['id'], []):
        candidates += extract_images_from_page(page)
    # Wikimedia exact and alias
    for q in [item['name']] + item['aliases'][:2]:
        candidates += wikimedia_candidates(q)
        time.sleep(0.4)
    # 360 image candidates
    for q in item['queries']:
        candidates += image_so_candidates(q, pages=1)
    return candidates


def _score_candidate(c, item):
    """Set ``c['match_text']`` and return the ranking score for candidate *c*."""
    text = ' '.join(str(c.get(k) or '') for k in ('title', 'source_page', 'url', 'query'))
    c['match_text'] = has_alias_text(text, item['aliases'])
    url = str(c.get('url') or '').lower()
    source_page = str(c.get('source_page') or '').lower()
    title = str(c.get('title') or '').lower()
    engine = c.get('engine', '')

    score = 0
    if engine.startswith('official'):
        score += 100
    if engine == 'wikimedia':
        score += 50
    if c['match_text']:
        score += 30
    longest = max(_to_int(c.get('reported_width')), _to_int(c.get('reported_height')))
    if longest >= 1500:
        score += 15
    if longest >= 2500:
        score += 15
    if any(x in (url+source_page) for x in ['sinaimg.cn', 'photo.tuchong.com', 'wikimedia', 'chnmuseum.cn']):
        score += 10
    if any(x in (url+title) for x in ['logo', 'watermark', 'qrcode', '二维码']):
        score -= 80
    return score


def _download_ranked(item, item_dir, ranked, seen_sha, errors):
    """Download up to 15 images from the top 80 ranked candidates.

    Failures are appended to *errors*; *seen_sha* is updated so byte-identical
    files are kept only once. Returns the list of record dicts.
    """
    downloaded = []
    existing_count = len([f for f in item_dir.iterdir()
                          if f.is_file() and f.suffix.lower() in IMAGE_SUFFIXES])
    for i, c in enumerate(ranked[:80], 1):
        if len(downloaded) >= 15:
            break
        # keep some low-confidence but prefer matched/official/wiki/top
        if i > 35 and c.get('score', 0) < 30:
            continue
        safe_engine = re.sub(r'\W+', '_', c.get('engine', 'web'))[:12]
        seq = existing_count+len(downloaded)+1
        base = item_dir/f"{seq:02d}_{safe_engine}"
        info, err = download_image(c, base, referer=c.get('source_page') or 'https://image.so.com/')
        if err:
            errors.append({'item': item['name'], 'url': c.get('url'), 'error': err})
            continue
        if info['sha256'] in seen_sha:
            Path(info['path']).unlink(missing_ok=True)
            continue
        seen_sha.add(info['sha256'])
        rec = {**c, **info, 'item': item['name'], 'item_id': item['id'], 'file': Path(info['path']).name}
        # rename with dimensions for easier browsing
        new_path = item_dir/f"{seq:02d}_{safe_engine}_{info['width']}x{info['height']}{Path(info['path']).suffix}"
        Path(info['path']).rename(new_path)
        rec['path'] = str(new_path)
        rec['file'] = new_path.name
        downloaded.append(rec)
        time.sleep(0.15)
    return downloaded


def _tsv_cell(value):
    """Flatten a value to one TSV cell: tabs and line breaks become spaces."""
    return str(value).replace('\t', ' ').replace('\r', ' ').replace('\n', ' ')


def main():
    """Run the full collect/score/download/report pipeline."""
    only_env = os.environ.get('ONLY_ITEMS', '').strip()
    only = {x.strip() for x in only_env.split(',') if x.strip()}
    run_items = [it for it in ITEMS if not only or it['id'] in only or it['name'] in only]
    all_records = []
    errors = []
    # carry existing file hashes so reruns do not duplicate within the same package
    seen_sha = set()
    # URLs are de-duplicated across all items in this run, not per item.
    seen_url = set()
    for f in IMG_ROOT.glob('*/*'):
        if f.is_file() and f.suffix.lower() in IMAGE_SUFFIXES:
            try:
                seen_sha.add(hashlib.sha256(f.read_bytes()).hexdigest())
            except OSError:
                # An unreadable file simply does not take part in de-duplication.
                pass

    for item in run_items:
        item_dir = IMG_ROOT/item['id']
        item_dir.mkdir(parents=True, exist_ok=True)
        candidates = _gather_candidates(item)
        # de-URL; score
        uniq = []
        for c in candidates:
            u = c.get('url')
            if not u or u in seen_url:
                continue
            seen_url.add(u)
            c['score'] = _score_candidate(c, item)
            uniq.append(c)
        uniq.sort(key=lambda x: x.get('score', 0), reverse=True)
        (SOURCES/f"{item['id']}_candidates.json").write_text(
            json.dumps(uniq, ensure_ascii=False, indent=2), encoding='utf-8')

        downloaded = _download_ranked(item, item_dir, uniq, seen_sha, errors)
        all_records.extend(downloaded)
        sheet = make_contact_sheet(
            item_dir, downloaded,
            ROOT/'contact_sheets'/f"{item['id']}_缩略图总览.jpg",
            f"{item['name']} 初筛图片 {len(downloaded)}张")
        print(item['name'], 'downloaded', len(downloaded), 'sheet', sheet)

    # global metadata
    (DOCS/'image_records.json').write_text(
        json.dumps(all_records, ensure_ascii=False, indent=2), encoding='utf-8')
    (DOCS/'download_errors.json').write_text(
        json.dumps(errors, ensure_ascii=False, indent=2), encoding='utf-8')
    columns = ['item', 'file', 'width', 'height', 'bytes', 'engine', 'title', 'source_page', 'url', 'sha256']
    with (DOCS/'图片清单.tsv').open('w', encoding='utf-8') as f:
        f.write('item\tfile\twidth\theight\tbytes\tengine\ttitle\tsource_page\timage_url\tsha256\n')
        for r in all_records:
            f.write('\t'.join(_tsv_cell(r.get(k, '')) for k in columns)+'\n')
    print('TOTAL', len(all_records), 'records')


if __name__ == '__main__':
    main()