import argparse
import datetime as dt
import hashlib
import json
import os
import re
import sys
import time
import urllib.parse
import urllib.request
import zipfile
from pathlib import Path

# Default directory for the per-store credential files (<store>_cookie.json,
# <store>_headers.json, <store>_session.json); overridable with --secret-dir.
SECRET_DIR_DEFAULT = Path('/Users/bot1/.hermes/profiles/it/secrets/alimama-exporter')
# Origin used to build the report endpoints and the default Origin/Referer headers.
BASE_URL = 'https://one.alimama.com'
# NOTE(review): not referenced anywhere in the visible part of this file.
COMMONAPI_BASE_URL = 'https://bpcommon.alimama.com'
# Lower-cased payload keys that safe_payload() redacts and secretless_payload() drops.
SENSITIVE_KEYS = {'cookie', 'authorization', 'csrfid', 'csrf', 'token', '_tb_token_', 'loginpointid', 'sign'}
# Lower-cased request header names that extract-credentials copies out of the HAR.
SAFE_HEADER_NAMES = {'accept', 'content-type', 'origin', 'referer', 'user-agent', 'x-requested-with'}
# Sort order applied to download templates; report keys not listed here sort last.
REPORT_PRIORITY = ['scene', 'campaign', 'bidword', 'item_promotion', 'crowd', 'creative', 'area']
# Display label per report key; create_task() also uses it as the excelName prefix.
REPORT_LABELS = {
    'scene': '营销场景报表',
    'campaign': '计划报表',
    'bidword': '关键词报表',
    'item_promotion': '商品报表',
    'crowd': '人群报表',
    'creative': '创意报表',
    'area': '地域报表',
}


def eprint(*args):
    """Print ``args`` to standard error, exactly as ``print`` would format them."""
    stream = sys.stderr
    print(*args, file=stream)


def load_json(path):
    """Read ``path`` as UTF-8 text and return the parsed JSON value.

    Undecodable bytes are replaced rather than raising, so a partially
    corrupted file still reaches the JSON parser.
    """
    source = Path(path)
    text = source.read_text(encoding='utf-8', errors='replace')
    return json.loads(text)


def write_json(path, data, secret=False):
    """Serialize ``data`` as indented UTF-8 JSON to ``path`` and return the Path.

    Parent directories are created as needed. With ``secret=True`` the file
    is restricted to mode 0o600 *before* any content is written, so the
    secret is never on disk under wider, umask-derived permissions.
    """
    path = Path(path)
    path.parent.mkdir(parents=True, exist_ok=True)
    text = json.dumps(data, ensure_ascii=False, indent=2)
    if not secret:
        path.write_text(text, encoding='utf-8')
        return path
    # Create with owner-only permissions from the start (no write-then-chmod window).
    fd = os.open(path, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600)
    with os.fdopen(fd, 'w', encoding='utf-8') as handle:
        # The creation mode is ignored for a pre-existing file, so tighten it
        # explicitly before the secret bytes are written.
        os.chmod(path, 0o600)
        handle.write(text)
    return path


def header_map(headers):
    """Turn a HAR-style list of ``{'name', 'value'}`` dicts into a dict.

    Keys are lower-cased header names; entries without a name are skipped,
    a missing value becomes ``''``, and later duplicates overwrite earlier ones.
    """
    mapping = {}
    for header in headers or []:
        name = header.get('name')
        if not name:
            continue
        mapping[name.lower()] = header.get('value', '')
    return mapping


def parse_cookie_header(cookie_header):
    """Split a ``Cookie`` header into a ``{name: value}`` dict.

    Segments without ``=`` or with an empty name are ignored; names and
    values are stripped of surrounding whitespace. ``None`` yields ``{}``.
    """
    jar = {}
    for segment in (cookie_header or '').split(';'):
        name, separator, value = segment.partition('=')
        if not separator:
            continue
        name = name.strip()
        if name:
            jar[name] = value.strip()
    return jar


def cookie_compare(a_header, b_header):
    """Summarize how two ``Cookie`` header strings overlap.

    Returns counts for each side, the sorted cookie names present in both,
    the subset of those whose values match, and whether the raw header
    strings are identical.
    """
    left = parse_cookie_header(a_header)
    right = parse_cookie_header(b_header)
    shared = sorted(left.keys() & right.keys())
    matching = [name for name in shared if left[name] == right[name]]
    summary = {
        'a_cookie_count': len(left),
        'b_cookie_count': len(right),
        'common_cookie_names_count': len(shared),
        'equal_name_value_count': len(matching),
        'identical_cookie_header': a_header == b_header,
        'common_cookie_names': shared,
        'equal_cookie_names': matching,
    }
    return summary


def read_cookie_json(path):
    """Return the cookie string stored in a JSON file, or ``''`` if none.

    Looks at the top-level ``Cookie`` key, then ``cookie``, then
    ``headers.Cookie``. A non-object JSON document yields ``''``.
    """
    document = load_json(path)
    if not isinstance(document, dict):
        return ''
    for key in ('Cookie', 'cookie'):
        value = document.get(key)
        if value:
            return value
    nested = document.get('headers', {}).get('Cookie')
    return nested or ''


def safe_payload(payload):
    """Return a copy of ``payload`` with sensitive values replaced by ``'[REDACTED]'``.

    Dicts and lists are walked recursively; a dict key counts as sensitive
    when its lower-cased form is in ``SENSITIVE_KEYS``. Other values are
    returned unchanged.
    """
    if isinstance(payload, list):
        return [safe_payload(item) for item in payload]
    if not isinstance(payload, dict):
        return payload
    redacted = {}
    for key, value in payload.items():
        if key.lower() in SENSITIVE_KEYS:
            redacted[key] = '[REDACTED]'
        else:
            redacted[key] = safe_payload(value)
    return redacted


def secretless_payload(payload):
    """Drop top-level keys whose lower-cased name is in ``SENSITIVE_KEYS``.

    Only the top level is filtered (nested dicts are kept as-is); a
    non-dict ``payload`` is returned unchanged.
    """
    if isinstance(payload, dict):
        kept = {}
        for key, value in payload.items():
            if key.lower() in SENSITIVE_KEYS:
                continue
            kept[key] = value
        return kept
    return payload


def json_post_text(entry):
    """Parse the JSON request body of a HAR entry.

    Reads ``entry['request']['postData']['text']`` and returns the decoded
    JSON value, or ``None`` when the body is missing, empty, or not JSON.
    """
    request = entry.get('request') or {}
    post_data = request.get('postData') or {}
    body = post_data.get('text') or ''
    if not body:
        return None
    try:
        parsed = json.loads(body)
    except Exception:
        # Best effort: any body that cannot be decoded is treated as absent.
        return None
    return parsed


def har_entries(path):
    """Load a HAR file and return its ``log.entries`` list (``[]`` if absent)."""
    document = load_json(path)
    log = document.get('log', {})
    return log.get('entries', [])


def request_url_parts(entry):
    """Return ``(url, netloc, path)`` for the request URL of a HAR entry.

    A missing request or URL yields three empty strings.
    """
    request = entry.get('request', {})
    raw_url = request.get('url', '')
    parsed = urllib.parse.urlparse(raw_url)
    return raw_url, parsed.netloc, parsed.path


def find_best_request(entries):
    """Pick the HAR entry best suited as a credential source.

    Only requests whose host ends with ``one.alimama.com`` (a plain suffix
    match) and that carry a Cookie header are considered. Candidates are
    scored: +100 for the createDownLoadTask endpoint, +40 for any
    ``/report/`` path, +10 each for ``csrfId`` / ``loginPointId`` in the
    JSON body. The highest score wins, the earliest entry breaking ties.
    If nothing scores, the first eligible entry is returned; ``None`` when
    no entry is eligible.
    """
    scored = []
    unscored = []
    for entry in entries:
        _url, host, path = request_url_parts(entry)
        if not host.endswith('one.alimama.com'):
            continue
        headers = header_map(entry.get('request', {}).get('headers'))
        if not headers.get('cookie'):
            continue
        body = json_post_text(entry) or {}
        score = sum((
            100 if '/report/createDownLoadTask.json' in path else 0,
            40 if path.startswith('/report/') else 0,
            10 if 'csrfId' in body else 0,
            10 if 'loginPointId' in body else 0,
        ))
        if score:
            scored.append((score, entry))
        else:
            unscored.append(entry)
    if scored:
        # max() keeps the first of several equally scored entries.
        return max(scored, key=lambda pair: pair[0])[1]
    if unscored:
        return unscored[0]
    return None


def find_first_url(entries, path_suffix):
    """Return the full URL of the first matching request in ``entries``.

    A request matches when its host ends with ``one.alimama.com`` and its
    path equals ``path_suffix`` exactly. Without a match the URL is built
    as ``BASE_URL + path_suffix``.
    """
    for entry in entries:
        full_url, host, path = request_url_parts(entry)
        if not host.endswith('one.alimama.com'):
            continue
        if path == path_suffix:
            return full_url
    return BASE_URL + path_suffix


def infer_report_key(payload):
    """Derive the report key for a createDownLoadTask payload.

    ``rptType`` is checked first (``account`` maps to ``scene`` only when
    ``queryDomains`` contains ``'scene'``), with the Chinese ``excelName``
    used as a fallback hint. Returns the raw ``rptType`` when nothing more
    specific matches, or ``'unknown'`` when it is empty.
    """
    report_type = str(payload.get('rptType') or '')
    domains = payload.get('queryDomains') or []
    excel_name = str(payload.get('excelName') or '')

    if report_type == 'account' and 'scene' in domains:
        return 'scene'
    if report_type in ('campaign', 'bidword'):
        return report_type
    # The item-report name check deliberately precedes the remaining rptType checks.
    if report_type == 'item_promotion' or '商品报表' in excel_name:
        return 'item_promotion'
    if report_type in ('crowd', 'creative', 'area'):
        return report_type

    name_hints = (
        ('营销场景', 'scene'),
        ('计划', 'campaign'),
        ('关键词', 'bidword'),
    )
    for marker, report_key in name_hints:
        if marker in excel_name:
            return report_key
    return report_type or 'unknown'


def collect_download_templates(har_path):
    """Extract de-duplicated download-task templates from a HAR file.

    Every ``createDownLoadTask.json`` request to a ``one.alimama.com`` host
    with a JSON-object body becomes one template. Templates are unique per
    (report key, payload without sensitive keys) and returned sorted by
    ``REPORT_PRIORITY``, unknown report keys last.
    """
    endpoint = '/report/createDownLoadTask.json'
    collected = []
    fingerprints = set()
    for position, entry in enumerate(har_entries(har_path)):
        _url, host, path = request_url_parts(entry)
        if path != endpoint or not host.endswith('one.alimama.com'):
            continue
        body = json_post_text(entry)
        if not isinstance(body, dict):
            continue
        report_key = infer_report_key(body)
        stripped = secretless_payload(body)
        # Canonical JSON makes payloads that differ only in key order compare equal.
        fingerprint = (report_key, json.dumps(stripped, ensure_ascii=False, sort_keys=True))
        if fingerprint in fingerprints:
            continue
        fingerprints.add(fingerprint)
        collected.append({
            'report_key': report_key,
            'report_label': REPORT_LABELS.get(report_key, body.get('excelName') or report_key),
            'har_entry_index': position,
            'endpoint_path': endpoint,
            'payload': stripped,
            'payload_preview': safe_payload(body),
            'original_excel_name': body.get('excelName'),
            'rptType': body.get('rptType'),
            'queryDomains': body.get('queryDomains'),
            'startTime': body.get('startTime'),
            'endTime': body.get('endTime'),
            'splitType': body.get('splitType'),
        })

    rank = {name: index for index, name in enumerate(REPORT_PRIORITY)}
    collected.sort(key=lambda template: rank.get(template['report_key'], 99))
    return collected


def month_range(month):
    """Return ISO dates for the first and last day of a ``YYYY-MM`` month.

    Raises ``ValueError`` when ``month`` is not two ``-``-separated
    integers forming a valid year/month.
    """
    year_text, month_text = month.split('-', 1)
    year, month_no = int(year_text), int(month_text)
    first_day = dt.date(year, month_no, 1)
    # The last day is the day before the first of the following month.
    if month_no == 12:
        next_year, next_month = year + 1, 1
    else:
        next_year, next_month = year, month_no + 1
    last_day = dt.date(next_year, next_month, 1) - dt.timedelta(days=1)
    return first_day.isoformat(), last_day.isoformat()


def sanitize_filename(name):
    """Make ``name`` usable as a file name.

    Each run of whitespace or ``\\ / : * ? " < > |`` collapses to a single
    underscore and leading/trailing underscores are trimmed. Falls back to
    ``'alimama_report'`` when nothing is left.
    """
    cleaned = re.sub(r'[\\/:*?"<>|\s]+', '_', str(name))
    cleaned = cleaned.strip('_')
    if cleaned:
        return cleaned
    return 'alimama_report'


def load_secret_bundle(store, secret_dir):
    """Load the stored credentials for ``store``.

    Reads ``<store>_cookie.json``, ``<store>_headers.json`` and
    ``<store>_session.json`` from ``secret_dir``. Returns
    ``(headers, session)`` where ``headers`` holds the non-empty stored
    headers plus ``Cookie``. Exits via ``SystemExit`` if any file is missing.
    """
    root = Path(secret_dir)
    paths = {kind: root / f'{store}_{kind}.json' for kind in ('cookie', 'headers', 'session')}
    if not all(path.exists() for path in paths.values()):
        raise SystemExit(f'MISSING credentials for store={store}; run extract-credentials first')
    cookie = read_cookie_json(paths['cookie'])
    stored_headers = load_json(paths['headers'])
    session = load_json(paths['session'])
    request_headers = {}
    for name, value in stored_headers.items():
        if value:
            request_headers[name] = value
    request_headers['Cookie'] = cookie
    return request_headers, session


def open_json_request(url, method='GET', headers=None, payload=None, timeout=30):
    """Send an HTTP request and return ``(parsed_json, response_headers)``.

    When ``payload`` is given it is sent as a compact UTF-8 JSON body and a
    JSON ``Content-Type`` is added unless the caller already set one; the
    caller's ``headers`` dict is never modified. Raises ``RuntimeError``
    with a short body preview when the response is not valid JSON.
    """
    request_headers = headers or {}
    body = None
    if payload is not None:
        body = json.dumps(payload, ensure_ascii=False, separators=(',', ':')).encode('utf-8')
        # Copy before adding the default so the caller's dict stays untouched.
        request_headers = dict(request_headers)
        request_headers.setdefault('Content-Type', 'application/json;charset=UTF-8')
    request = urllib.request.Request(url, data=body, headers=request_headers, method=method)
    with urllib.request.urlopen(request, timeout=timeout) as response:
        raw = response.read()
        content_type = response.headers.get('Content-Type', '')
        decoded = raw.decode('utf-8', errors='replace')
        try:
            parsed = json.loads(decoded)
        except Exception:
            raise RuntimeError(f'non-json response status={response.status} ctype={content_type} preview={decoded[:200]!r}')
        return parsed, response.headers


def extract_download_url(obj):
    """Find the first http(s) download URL inside a nested JSON value.

    In a dict, the keys ``downloadUrl``, ``url``, ``fileUrl`` and ``ossUrl``
    are checked (in that order) before descending into its values; lists
    are searched element by element. Returns ``None`` when nothing matches.
    """
    if isinstance(obj, dict):
        for field in ('downloadUrl', 'url', 'fileUrl', 'ossUrl'):
            candidate = obj.get(field)
            if isinstance(candidate, str) and candidate.startswith(('http://', 'https://')):
                return candidate
        children = obj.values()
    elif isinstance(obj, list):
        children = obj
    else:
        return None
    for child in children:
        nested = extract_download_url(child)
        if nested:
            return nested
    return None


def create_task(template, headers, session, month, dry_run=False):
    """Create an async download task for one report template and month.

    The template payload is copied and overridden with the month's date
    range, ``splitType='month'``, the session's ``csrfId`` /
    ``loginPointId``, and a fresh timestamped ``excelName``. With
    ``dry_run`` nothing is sent and a redacted payload preview is
    returned. Raises ``RuntimeError`` when the response has no ``taskId``.
    """
    first_day, last_day = month_range(month)
    payload = dict(template['payload'])
    report_key = template['report_key']
    # splitType is forced to 'month' for every report; this also covers
    # bidword templates that were captured with a weekly split.
    payload.update({
        'startTime': first_day,
        'endTime': last_day,
        'splitType': 'month',
        'csrfId': session.get('csrfId'),
        'loginPointId': session.get('loginPointId'),
    })
    payload.setdefault('bizCode', session.get('bizCode') or 'universalBP')
    payload['source'] = payload.get('source') or 'async_dowdload'
    label = REPORT_LABELS.get(report_key, report_key)
    stamp = dt.datetime.now().strftime('%Y%m%d_%H%M%S')
    payload['excelName'] = f"{label}_{month.replace('-', '')}_{stamp}"
    url = session.get('create_url') or (BASE_URL + '/report/createDownLoadTask.json')
    if dry_run:
        return {'dry_run': True, 'report_key': report_key, 'excelName': payload['excelName'], 'payload_preview': safe_payload(payload)}

    response, _headers = open_json_request(url, method='POST', headers=headers, payload=payload, timeout=60)
    envelope = response or {}
    task_id = (envelope.get('data') or {}).get('taskId')
    ok = (envelope.get('info') or {}).get('ok')
    if task_id:
        return {'taskId': task_id, 'report_key': report_key, 'excelName': payload['excelName'], 'response_ok': ok}
    raise RuntimeError(f'create task failed for {report_key}: ok={ok} response={safe_payload(response)}')


def poll_download_url(task_id, headers, session, attempts=80, seconds=5):
    """Poll the getDownloadUrl endpoint until the task exposes a download URL.

    Args:
        task_id: Task id returned by the create-task call.
        headers: Request headers (including the Cookie) sent with each poll.
        session: Session dict; ``get_download_url`` and ``bizCode`` are read.
        attempts: Maximum number of polls.
        seconds: Pause between two consecutive polls.

    Returns:
        ``(download_url, attempt)`` where ``attempt`` is the 1-based poll
        that produced the URL.

    Raises:
        TimeoutError: No download URL appeared within ``attempts`` polls.
    """
    base = session.get('get_download_url') or (BASE_URL + '/commonapi/report/async/getDownloadUrl.json')
    parsed = urllib.parse.urlparse(base)
    # Keep any query parameters already on the stored URL, then set ours.
    qs = dict(urllib.parse.parse_qsl(parsed.query, keep_blank_values=True))
    qs.update({'taskId': str(task_id), 'bizCode': session.get('bizCode') or 'universalBP'})
    url = urllib.parse.urlunparse(parsed._replace(query=urllib.parse.urlencode(qs)))
    last = None
    for attempt in range(1, attempts + 1):
        data, _ = open_json_request(url, method='GET', headers=headers, timeout=30)
        last = data
        dl = extract_download_url(data)
        if dl:
            return dl, attempt
        try:
            status = data.get('data', {}).get('result', {}).get('status')
        except (AttributeError, TypeError):
            # Unexpected response shape (e.g. null or list where a dict is expected).
            status = None
        print(json.dumps({'event': 'poll', 'taskId': task_id, 'attempt': attempt, 'status': status, 'downloadUrl': bool(dl)}, ensure_ascii=False))
        # Sleeping after the final attempt would only delay the timeout error.
        if attempt < attempts:
            time.sleep(seconds)
    raise TimeoutError(f'task {task_id} did not produce downloadUrl after {attempts} attempts; last={safe_payload(last)}')


def download_file(url, out_path, headers=None):
    """Download ``url`` to ``out_path`` and return the number of bytes written.

    All given headers except ``Cookie`` (any casing) are forwarded. Parent
    directories of ``out_path`` are created as needed. The whole response
    is read into memory before it is written.
    """
    forwarded = {}
    for name, value in (headers or {}).items():
        if name.lower() == 'cookie':
            continue
        forwarded[name] = value
    request = urllib.request.Request(url, headers=forwarded)
    with urllib.request.urlopen(request, timeout=120) as response:
        content = response.read()
    target = Path(out_path)
    target.parent.mkdir(parents=True, exist_ok=True)
    target.write_bytes(content)
    return len(content)


def cmd_doctor(args):
    """Print a JSON self-check: Python version, secret dir, available commands."""
    report = {
        'ok': True,
        'python': sys.version.split()[0],
        'secret_dir': str(Path(args.secret_dir).expanduser()),
        'commands': ['scan-har', 'extract-credentials', 'compare-credentials', 'build-downloads', 'export-month'],
    }
    rendered = json.dumps(report, ensure_ascii=False, indent=2)
    print(rendered)


def cmd_scan_har(args):
    """Summarize a HAR file: entry count, top 20 hosts, download templates.

    The summary leaves out each template's raw ``payload``. It is printed
    as JSON and also written to ``args.out`` when that is set.
    """
    entries = har_entries(args.har)
    templates = collect_download_templates(args.har)

    host_counts = {}
    for entry in entries:
        host = request_url_parts(entry)[1]
        host_counts[host] = host_counts.get(host, 0) + 1
    ranked_hosts = sorted(host_counts.items(), key=lambda pair: pair[1], reverse=True)

    previews = []
    for template in templates:
        previews.append({field: value for field, value in template.items() if field != 'payload'})

    result = {
        'har': str(args.har),
        'entry_count': len(entries),
        'hosts_top': ranked_hosts[:20],
        'download_template_count': len(templates),
        'download_templates': previews,
    }
    if args.out:
        write_json(args.out, result)
    print(json.dumps(result, ensure_ascii=False, indent=2))


def cmd_extract_credentials(args):
    """Extract cookie, headers and session data from a HAR into the secret dir.

    Uses the request chosen by ``find_best_request``. Writes
    ``<store>_cookie.json``, ``<store>_headers.json`` and
    ``<store>_session.json`` as secret files, optionally a comparison
    against the cookie files in ``args.compare_qianniu_dir``, then prints
    a JSON summary containing only counts and flags. Exits via
    ``SystemExit`` when no suitable request exists.
    """
    entries = har_entries(args.har)
    best = find_best_request(entries)
    if best is None:
        raise SystemExit('No one.alimama.com request with Cookie found')
    captured = header_map(best.get('request', {}).get('headers'))
    payload = json_post_text(best) or {}

    # Keep only the allow-listed headers, restoring Title-Case names.
    safe_headers = {}
    for lowered, value in captured.items():
        if lowered not in SAFE_HEADER_NAMES:
            continue
        words = [word.capitalize() for word in lowered.split('-')]
        safe_headers['-'.join(words)] = value
    safe_headers.setdefault('Origin', BASE_URL)
    safe_headers.setdefault('Referer', BASE_URL + '/index.html')

    cookie_header = captured.get('cookie', '')
    secret_dir = Path(args.secret_dir)
    secret_dir.mkdir(parents=True, exist_ok=True)
    store = args.store
    cookie_names = sorted(parse_cookie_header(cookie_header))
    session = {
        'store': store,
        'source_har_name': Path(args.har).name,
        'base_url': BASE_URL,
        'bizCode': payload.get('bizCode') or 'universalBP',
        'csrfId': payload.get('csrfId'),
        'loginPointId': payload.get('loginPointId'),
        'create_url': find_first_url(entries, '/report/createDownLoadTask.json'),
        'get_download_url': BASE_URL + '/commonapi/report/async/getDownloadUrl.json',
        'find_page_url': BASE_URL + '/commonapi/report/async/findPage.json',
        'cookie_names': cookie_names,
        'created_at': dt.datetime.now().isoformat(timespec='seconds'),
    }

    secret_documents = (
        ('cookie', {'Cookie': cookie_header}),
        ('headers', safe_headers),
        ('session', session),
    )
    for suffix, document in secret_documents:
        write_json(secret_dir / f'{store}_{suffix}.json', document, secret=True)

    compare_summary = None
    if args.compare_qianniu_dir:
        compare_summary = compare_with_qianniu(cookie_header, args.compare_qianniu_dir)
        write_json(secret_dir / f'{store}_qianniu_compare.json', compare_summary, secret=True)

    public = {
        'ok': True,
        'store': store,
        'secret_files': [f'{store}_cookie.json', f'{store}_headers.json', f'{store}_session.json'],
        'cookie_count': len(cookie_names),
        'has_csrfId': bool(session.get('csrfId')),
        'has_loginPointId': bool(session.get('loginPointId')),
        'qianniu_compare': summarize_compare(compare_summary) if compare_summary else None,
    }
    print(json.dumps(public, ensure_ascii=False, indent=2))


def compare_with_qianniu(cookie_header, qn_dir):
    """Compare ``cookie_header`` with every ``*_cookie.json`` in ``qn_dir``.

    Returns a dict keyed by file name (files visited in sorted order). A
    file that cannot be read or compared gets ``{'error': <message>}``
    instead of a comparison.
    """
    report = {}
    for candidate in sorted(Path(qn_dir).glob('*_cookie.json')):
        try:
            report[candidate.name] = cookie_compare(cookie_header, read_cookie_json(candidate))
        except Exception as exc:
            # Best effort: one bad file must not abort the whole comparison.
            report[candidate.name] = {'error': str(exc)}
    return report


def summarize_compare(compare):
    """Reduce a cookie comparison to its count/flag fields.

    Cookie-name lists are left out; a field missing from an item (for
    example an error entry) becomes ``None``. Returns ``None`` for an
    empty or missing comparison.
    """
    if not compare:
        return None
    fields = (
        'identical_cookie_header',
        'common_cookie_names_count',
        'equal_name_value_count',
        'a_cookie_count',
        'b_cookie_count',
    )
    return {
        name: {field: item.get(field) for field in fields}
        for name, item in compare.items()
    }


def cmd_compare_credentials(args):
    """Print a JSON summary comparing one cookie file with a directory of them."""
    reference_cookie = read_cookie_json(args.cookie_file)
    comparison = compare_with_qianniu(reference_cookie, args.qianniu_dir)
    summary = summarize_compare(comparison)
    print(json.dumps(summary, ensure_ascii=False, indent=2))


def cmd_build_downloads(args):
    """Write a download manifest built from a HAR and print a short summary.

    The manifest holds the collected templates plus fixed lists of
    recommended, optional and deferred report keys.
    """
    templates = collect_download_templates(args.har)
    manifest = {
        'source_har': str(args.har),
        'created_at': dt.datetime.now().isoformat(timespec='seconds'),
        'templates': templates,
        'recommended_reports': ['campaign', 'scene', 'bidword'],
        'optional_reports': ['item_promotion', 'crowd'],
        'deferred_reports': ['creative', 'area'],
    }
    write_json(args.out, manifest)
    summary = {
        'ok': True,
        'out': str(args.out),
        'template_count': len(templates),
        'reports': [template['report_key'] for template in templates],
    }
    print(json.dumps(summary, ensure_ascii=False, indent=2))


def load_manifest(path):
    """Return the ``templates`` list of a manifest file (``[]`` if missing or empty)."""
    manifest = load_json(path)
    templates = manifest.get('templates')
    return templates if templates else []


def cmd_export_month(args):
    """Export the selected monthly reports for one store.

    For each manifest template whose report key is listed in
    ``args.reports`` (capped by ``args.limit``): create the task, poll for
    the download URL, download the file and check that it is a zip
    container. Every step is printed as a JSON event, and all events are
    written to ``export_manifest.json`` in the output directory. With
    ``--dry-run`` only the create-task previews are produced.

    Raises ``SystemExit`` when no template matches and ``RuntimeError``
    when a downloaded file is not a valid xlsx.
    """
    headers, session = load_secret_bundle(args.store, args.secret_dir)
    templates = load_manifest(args.manifest)
    stripped_names = (name.strip() for name in args.reports.split(','))
    wanted = [name for name in stripped_names if name]
    selected = [template for template in templates if template.get('report_key') in wanted]
    if args.limit:
        selected = selected[:args.limit]
    if not selected:
        raise SystemExit(f'No matching reports in manifest: {wanted}')
    out_dir = Path(args.out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)
    events = []

    def record(event):
        # Keep the event for the manifest and echo it as one JSON line.
        events.append(event)
        print(json.dumps(event, ensure_ascii=False))

    for template in selected:
        created = create_task(template, headers, session, args.month, dry_run=args.dry_run)
        record({'event': 'create_task', **created})
        if args.dry_run:
            continue
        task_id = created['taskId']
        dl_url, poll_attempt = poll_download_url(task_id, headers, session, attempts=args.poll_attempts, seconds=args.poll_seconds)
        out_path = out_dir / (sanitize_filename(created['excelName']) + '.xlsx')
        size = download_file(dl_url, out_path, headers=headers)
        # An .xlsx file is a zip archive, so a zip check catches error pages.
        valid_xlsx = zipfile.is_zipfile(out_path)
        record({'event': 'downloaded', 'report_key': template['report_key'], 'taskId': task_id, 'poll_attempt': poll_attempt, 'file': str(out_path), 'bytes': size, 'valid_xlsx': valid_xlsx})
        if not valid_xlsx:
            raise RuntimeError(f'Downloaded file is not valid xlsx: {out_path}')

    write_json(out_dir / 'export_manifest.json', {'store': args.store, 'month': args.month, 'events': events})
    summary = {'ok': True, 'store': args.store, 'month': args.month, 'out_dir': str(out_dir), 'event_count': len(events)}
    print(json.dumps(summary, ensure_ascii=False, indent=2))


def build_parser():
    """Build the ``alimama-export`` argument parser with all sub-commands.

    Each sub-command stores its handler in ``func`` so ``main`` can
    dispatch on the parsed namespace.
    """
    parser = argparse.ArgumentParser(prog='alimama-export', description='HAR-driven Alimama/Wanxiangtai report exporter')
    parser.add_argument('--secret-dir', default=str(SECRET_DIR_DEFAULT))
    commands = parser.add_subparsers(dest='cmd', required=True)

    doctor = commands.add_parser('doctor')
    doctor.set_defaults(func=cmd_doctor)

    scan = commands.add_parser('scan-har')
    scan.add_argument('har')
    scan.add_argument('--out')
    scan.set_defaults(func=cmd_scan_har)

    extract = commands.add_parser('extract-credentials')
    extract.add_argument('har')
    extract.add_argument('--store', required=True)
    extract.add_argument('--compare-qianniu-dir')
    extract.set_defaults(func=cmd_extract_credentials)

    compare = commands.add_parser('compare-credentials')
    compare.add_argument('--cookie-file', required=True)
    compare.add_argument('--qianniu-dir', required=True)
    compare.set_defaults(func=cmd_compare_credentials)

    build = commands.add_parser('build-downloads')
    build.add_argument('har')
    build.add_argument('--out', required=True)
    build.set_defaults(func=cmd_build_downloads)

    export = commands.add_parser('export-month')
    export.add_argument('--month', required=True, help='YYYY-MM')
    export.add_argument('--store', required=True)
    export.add_argument('--manifest', required=True)
    export.add_argument('--reports', default='campaign,scene,bidword')
    export.add_argument('--out-dir', required=True)
    export.add_argument('--dry-run', action='store_true')
    export.add_argument('--limit', type=int)
    export.add_argument('--poll-attempts', type=int, default=80)
    export.add_argument('--poll-seconds', type=int, default=5)
    export.set_defaults(func=cmd_export_month)
    return parser


def main(argv=None):
    """Parse ``argv`` (default: ``sys.argv[1:]``) and run the chosen sub-command."""
    args = build_parser().parse_args(argv)
    handler = args.func
    handler(args)


# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == '__main__':
    main()
