from pathlib import Path
import re, json, math, statistics
from datetime import datetime
import xlrd, openpyxl
from collections import defaultdict
BASE=Path('/Users/bot1/Volumes/root_for_ai/AI工作区/良渚文化_月报_2026年5月_20260601_1123')
SRC=BASE/'source'; WORK=BASE/'work'; DEL=BASE/'deliverables'
WORK.mkdir(exist_ok=True); DEL.mkdir(exist_ok=True)

def norm_header(h):
    if h is None: return ''
    return str(h).strip()

def to_float(v):
    if v is None: return None
    if isinstance(v,(int,float)):
        if isinstance(v,float) and math.isnan(v): return None
        return float(v)
    s=str(v).strip()
    if s in ['', '-', '--', '—', '待定', 'None', 'nan']: return None
    s=s.replace(',','').replace('￥','').replace('¥','').replace('元','').strip()
    pct=False
    if s.endswith('%'):
        pct=True; s=s[:-1]
    try:
        val=float(s)
        return val/100 if pct else val
    except:
        return None

def to_str(v):
    if v is None: return ''
    if isinstance(v,float) and v.is_integer(): return str(int(v))
    return str(v).strip()

def read_xlsx_rows(path):
    # Some JD lowcode XLSX files have incomplete read-only dimensions; normal mode reads them correctly.
    wb=openpyxl.load_workbook(path, data_only=True, read_only=False)
    ws=wb[wb.sheetnames[0]]
    rows=[]
    for row in ws.iter_rows(values_only=True): rows.append(list(row))
    wb.close(); return rows

def read_xls_rows(path):
    book=xlrd.open_workbook(path)
    sh=book.sheet_by_index(0)
    rows=[]
    for r in range(sh.nrows): rows.append([sh.cell_value(r,c) for c in range(sh.ncols)])
    return rows

def read_rows(path):
    return read_xlsx_rows(path) if path.suffix.lower()=='.xlsx' else read_xls_rows(path)

def rows_to_dicts(rows, header_idx):
    headers=[norm_header(h) for h in rows[header_idx]]
    out=[]
    for row in rows[header_idx+1:]:
        if all(x is None or str(x).strip()=='' for x in row): continue
        d={headers[i]: row[i] if i < len(row) else None for i in range(len(headers)) if headers[i]}
        out.append(d)
    return out, headers

def find_file(substrs):
    for p in SRC.iterdir():
        name=p.name
        if all(s in name for s in substrs): return p
    raise FileNotFoundError(substrs)

def period_month(s):
    ss=to_str(s)
    m=re.search(r'(20\d{2})[-年/]?(\d{1,2})', ss)
    if m: return f"{int(m.group(1)):04d}-{int(m.group(2)):02d}"
    return ''

# Store monthly summaries
store=[]
# Tmall & Taobao monthly xlsx
for platform, substrs in [('天猫',['天猫分月']),('淘宝',['淘宝分月'])]:
    p=find_file(substrs)
    rows=read_rows(p); data, headers=rows_to_dicts(rows,0)
    for d in data:
        month=period_month(d.get('统计日期'))
        if not month: continue
        pay=to_float(d.get('支付金额'))
        refund=to_float(d.get('成功退款金额') or d.get('退款金额') or d.get('退款金额'))
        buyers=to_float(d.get('支付买家数'))
        uv=to_float(d.get('访客数'))
        conv=to_float(d.get('支付转化率'))
        if conv is None and uv and buyers is not None: conv=buyers/uv
        store.append({'平台':platform,'月份':month,'期间':to_str(d.get('统计日期')),'店铺名称':to_str(d.get('店铺名称')),
                      '访客数':uv,'浏览量':to_float(d.get('浏览量')),'支付金额':pay,'成交金额':pay,
                      '退款金额':refund,'实际成交金额':pay-refund if pay is not None and refund is not None else pay,
                      '支付买家数':buyers,'支付件数':to_float(d.get('支付件数')),'转化率':conv,
                      '客单价':to_float(d.get('客单价')) or (pay/buyers if pay and buyers else None),
                      '加购人数':to_float(d.get('加购人数')),'收藏次数':to_float(d.get('商品收藏次数')),
                      '来源文件':p.name})
# JD monthly xls
p=find_file(['京东月报'])
rows=read_rows(p); data, headers=rows_to_dicts(rows,0)
for d in data:
    month=period_month(d.get('日期'))
    pay=to_float(d.get('成交金额-全部渠道'))
    uv=to_float(d.get('访客数-全部渠道'))
    buyers=to_float(d.get('成交客户数-全部渠道'))
    conv=to_float(d.get('成交转化率-全部渠道')) or (buyers/uv if buyers is not None and uv else None)
    store.append({'平台':'京东','月份':month,'期间':to_str(d.get('日期')),'店铺名称':'良渚文化京东店',
                  '访客数':uv,'浏览量':to_float(d.get('浏览量-全部渠道')),'支付金额':pay,'成交金额':pay,
                  '退款金额':None,'实际成交金额':pay,'支付买家数':buyers,'支付件数':None,'转化率':conv,
                  '客单价':to_float(d.get('客单价-全部渠道')) or (pay/buyers if pay and buyers else None),
                  '加购人数':None,'收藏次数':None,'来源文件':p.name})

# JD trade summary lowcode export (`6.har`) is the preferred 2026-05 refund/net-sales source.
# It contains store-level refund/after-sales fields that are more reliable for monthly net-sales口径
# than summing product-detail rows. Override/complete the matching store month when present.
for p in SRC.glob('*京东*交易概况*.xlsx'):
    rows=read_rows(p)
    if len(rows) < 2:
        continue
    headers=[norm_header(x) for x in rows[0]]
    values=rows[1]
    d={headers[i]: values[i] if i < len(values) else None for i in range(len(headers)) if headers[i]}
    month=period_month(d.get('时间'))
    if not month:
        continue
    pay=to_float(d.get('成交金额'))
    refund=to_float(d.get('退款金额'))
    uv=to_float(d.get('店铺访客数'))
    pv=to_float(d.get('店铺浏览量'))
    buyers=to_float(d.get('成交客户数'))
    units=to_float(d.get('成交商品件数'))
    conv=to_float(d.get('店铺成交转化率')) or (buyers/uv if buyers is not None and uv else None)
    base={'平台':'京东','月份':month,'期间':to_str(d.get('时间')),'店铺名称':'良渚文化京东店',
          '访客数':uv,'浏览量':pv,'支付金额':pay,'成交金额':pay,
          '退款金额':refund,'实际成交金额':pay-refund if pay is not None and refund is not None else pay,
          '支付买家数':buyers,'支付件数':units,'转化率':conv,
          '客单价':to_float(d.get('客单价')) or (pay/buyers if pay and buyers else None),
          '加购人数':to_float(d.get('加购客户数')),'收藏次数':None,
          '退款商品件数':to_float(d.get('退款商品件数')),'退款单量':to_float(d.get('退款单量')),
          '来源文件':p.name}
    replaced=False
    for i,row in enumerate(store):
        if row.get('平台')=='京东' and row.get('月份')==month:
            store[i]={**row, **{k:v for k,v in base.items() if v is not None or k in ['退款金额','实际成交金额','来源文件']}}
            replaced=True
            break
    if not replaced:
        store.append(base)

# Product summaries Tmall/Taobao monthly product all. Header row 4 (0-based), one summary row per product.
products=[]
for p in SRC.iterdir():
    name=p.name
    if ('商品_全部' in name) and (('【天猫】' in name) or ('【淘宝】' in name)):
        platform='天猫' if '【天猫】' in name else '淘宝'
        # parse month from filename
        m=re.search(r'(2026-\d{2})-\d{2}_[0-9]{4}-\d{2}-\d{2}', name)
        month=m.group(1) if m else ''
        rows=read_rows(p); data, headers=rows_to_dicts(rows,4)
        for d in data:
            pid=to_str(d.get('商品ID'))
            if not pid or pid=='商品ID' or pid=='合计': continue
            pay=to_float(d.get('支付金额'))
            refund=to_float(d.get('成功退款金额'))
            uv=to_float(d.get('商品访客数'))
            buyers=to_float(d.get('支付买家数'))
            units=to_float(d.get('支付件数'))
            conv=to_float(d.get('商品支付转化率') or d.get('支付转化率')) or (buyers/uv if buyers is not None and uv else None)
            products.append({'平台':platform,'月份':month,'商品ID':pid,'商品名称':to_str(d.get('商品名称')),'货号':to_str(d.get('货号')),
                             '访客数':uv,'浏览量':to_float(d.get('商品浏览量')),'收藏人数':to_float(d.get('商品收藏人数')),
                             '加购件数':to_float(d.get('商品加购件数')),'加购人数':to_float(d.get('商品加购人数')),
                             '支付买家数':buyers,'支付件数':units,'支付金额':pay,'成交金额':pay,
                             '退款金额':refund,'实际成交金额':pay-refund if pay is not None and refund is not None else pay,
                             '转化率':conv,'来源文件':name})
# JD product monthly: use monthly summary period rows only (time contains ~), exclude daily rows if any and exclude 合计 for top list but capture total.
for p in SRC.iterdir():
    name=p.name
    if '京东_商品明细' in name:
        month=period_month(name)
        rows=read_rows(p); data, headers=rows_to_dicts(rows,0)
        for d in data:
            t=to_str(d.get('时间'))
            if '~' not in t: continue
            spu=to_str(d.get('SPU'))
            if not spu or spu=='SPU' or spu=='合计': continue
            pay=to_float(d.get('成交金额'))
            refund=to_float(d.get('取消及售后退款金额'))
            uv=to_float(d.get('商品访客数'))
            buyers=to_float(d.get('成交客户数'))
            units=to_float(d.get('成交商品件数'))
            conv=to_float(d.get('成交转化率')) or (buyers/uv if buyers is not None and uv else None)
            products.append({'平台':'京东','月份':month,'商品ID':spu,'商品名称':to_str(d.get('SPU名称')),'货号':to_str(d.get('货号')),
                             '访客数':uv,'浏览量':to_float(d.get('商品浏览量')),'收藏人数':None,
                             '加购件数':to_float(d.get('加购商品件数')),'加购人数':to_float(d.get('加购客户数')),
                             '支付买家数':buyers,'支付件数':units,'支付金额':pay,'成交金额':pay,
                             '退款金额':refund,'实际成交金额':pay-refund if pay is not None and refund is not None else pay,
                             '转化率':conv,'来源文件':name})

# Promotion: only plan report as total口径, aggregate by platform/month/scenario.
promos=[]
for platform, key in [('天猫','【天猫】计划报表'),('淘宝','【淘宝】计划报表')]:
    p=next((x for x in SRC.iterdir() if key in x.name), None)
    if not p: continue
    rows=read_rows(p); data, headers=rows_to_dicts(rows,0)
    for d in data:
        dt=to_str(d.get('日期'))
        month=period_month(dt)
        if not month: continue
        spend=to_float(d.get('花费')) or 0.0
        gmv=to_float(d.get('总成交金额'))
        net=to_float(d.get('净成交金额'))
        orders=to_float(d.get('总成交笔数'))
        clicks=to_float(d.get('点击量')) or 0.0
        imps=to_float(d.get('展现量')) or 0.0
        addcart=to_float(d.get('总购物车数') or d.get('总收藏加购数'))
        promos.append({'平台':platform,'月份':month,'场景名字':to_str(d.get('场景名字')),'计划名字':to_str(d.get('计划名字')),
                       '花费':spend,'总成交金额':gmv or 0.0,'净成交金额':net,'成交笔数':orders,'点击量':clicks,'展现量':imps,
                       '收藏加购':addcart,'来源文件':p.name})

# Additional dimension: paid scenario report by scene for May, use for breakdown only not total if needed.
# New product keyword flag
new_kw=[('玉鸟咕咕毛绒挂件',['玉鸟咕咕','毛绒','公仔']),('礼乐粽子',['粽子','礼乐']),('黑陶茶具',['黑陶','茶具'])]
def new_category(name):
    n=name or ''
    cats=[]
    # specific logic: require key term for category, not all words
    if ('玉鸟咕咕' in n and ('毛绒' in n or '公仔' in n or '挂件' in n)): cats.append('玉鸟咕咕毛绒挂件')
    if '粽子' in n or '礼乐' in n: cats.append('礼乐粽子')
    if '黑陶' in n and ('茶具' in n or '茶杯' in n or '酒具' in n or '礼盒' in n): cats.append('黑陶茶具')
    return '、'.join(cats)
for r in products:
    r['新品类别']=new_category(r['商品名称'])

# Write cleaned JSON
for filename, data in [('store_monthly.json',store),('product_monthly.json',products),('promo_plan_rows.json',promos)]:
    (WORK/filename).write_text(json.dumps(data,ensure_ascii=False,indent=2),encoding='utf-8')

# Aggregate helpers
def agg(rows, keys, sums):
    d={}
    for r in rows:
        k=tuple(r.get(x) for x in keys)
        if k not in d:
            d[k]={x:r.get(x) for x in keys}
            for s in sums: d[k][s]=0.0
        for s in sums:
            v=r.get(s)
            if v is not None: d[k][s]+=float(v)
    return list(d.values())

# summary tables for May and April
store_may=[r for r in store if r['月份']=='2026-05']
store_apr=[r for r in store if r['月份']=='2026-04']
prod_may=[r for r in products if r['月份']=='2026-05']
prod_apr=[r for r in products if r['月份']=='2026-04']
promo_may=agg([r for r in promos if r['月份']=='2026-05'], ['平台','月份','场景名字'], ['花费','总成交金额','成交笔数','点击量','展现量'])
promo_apr=agg([r for r in promos if r['月份']=='2026-04'], ['平台','月份','场景名字'], ['花费','总成交金额','成交笔数','点击量','展现量'])
for r in promo_may+promo_apr:
    r['ROI']=r['总成交金额']/r['花费'] if r['花费'] else None
    r['点击率']=r['点击量']/r['展现量'] if r['展现量'] else None
    r['CPC']=r['花费']/r['点击量'] if r['点击量'] else None

summary={'store_may':store_may,'store_apr':store_apr,'promo_may':promo_may,'promo_apr':promo_apr,
         'counts':{'store':len(store),'products':len(products),'promos':len(promos)}}
(WORK/'analysis_summary.json').write_text(json.dumps(summary,ensure_ascii=False,indent=2),encoding='utf-8')
print(json.dumps(summary['counts'],ensure_ascii=False))
print('STORE MAY')
for r in store_may: print(r)
print('PROMO MAY')
for r in promo_may: print(r)
print('MISSING May platforms in store:', set(['天猫','淘宝','京东'])-set(r['平台'] for r in store_may))
# Print Top 10 JD May/Apr
for mon in ['2026-05','2026-04']:
    top=sorted([r for r in products if r['平台']=='京东' and r['月份']==mon], key=lambda x:x.get('实际成交金额') or 0, reverse=True)[:10]
    print('JD TOP',mon)
    total=sum((r.get('实际成交金额') or 0) for r in [x for x in products if x['平台']=='京东' and x['月份']==mon])
    for r in top:
        print(r['商品名称'][:40], r['实际成交金额'], r['支付件数'], '占比', (r['实际成交金额'] or 0)/total if total else None)