import argparse
import datetime as dt
import csv
import hashlib
import json
import os
import re
import sys
import time
import urllib.parse
import urllib.request
import zipfile
from pathlib import Path

# Default value of the ``--secret-dir`` CLI option (where credential files are read/written).
SECRET_DIR_DEFAULT = Path('/Users/bot1/.hermes/profiles/it/secrets/alimama-exporter')
# Base URL used for the create-task endpoint and the default Origin/Referer headers.
BASE_URL = 'https://one.alimama.com'
# Base URL used for the asynchronous ``/commonapi/report/async/*`` endpoints.
COMMONAPI_BASE_URL = 'https://bpcommon.alimama.com'
# Lower-cased payload key names treated as secret; keys are compared via ``key.lower()``.
SENSITIVE_KEYS = {'cookie', 'authorization', 'csrfid', 'csrf', 'token', '_tb_token_', 'loginpointid', 'sign'}
# Lower-cased request header names that extract-credentials copies into ``*_headers.json``.
SAFE_HEADER_NAMES = {'accept', 'content-type', 'origin', 'referer', 'user-agent', 'x-requested-with'}
# Sort order applied to collected download templates; unknown report keys sort last.
REPORT_PRIORITY = ['scene', 'campaign', 'bidword', 'item_promotion', 'crowd', 'creative', 'area']
# Label per report key; used for template labels and as the prefix of generated excelName values.
REPORT_LABELS = {
    'scene': '营销场景报表',
    'campaign': '计划报表',
    'bidword': '关键词报表',
    'item_promotion': '商品报表',
    'crowd': '人群报表',
    'creative': '创意报表',
    'area': '地域报表',
}


def eprint(*values):
    """Print *values* to standard error using ``print``'s default formatting."""
    print(*values, file=sys.stderr)


def load_json(path):
    """Parse the file at *path* as JSON.

    The file is read as UTF-8; undecodable bytes are replaced rather than
    raising, so only JSON syntax errors surface from ``json.loads``.
    """
    raw_text = Path(path).read_text(encoding='utf-8', errors='replace')
    return json.loads(raw_text)


def write_json(path, data, secret=False):
    """Serialize *data* as indented UTF-8 JSON to *path* and return the path.

    Parent directories are created as needed. When *secret* is true the file
    is created with mode ``0o600`` and its permissions are tightened *before*
    any content is written, so the secret bytes are never on disk under a
    more permissive mode (the previous write-then-chmod order left such a
    window).

    Args:
        path: Destination file (str or Path).
        data: JSON-serializable object.
        secret: Restrict the file to owner read/write.

    Returns:
        The destination as a ``Path``.
    """
    path = Path(path)
    path.parent.mkdir(parents=True, exist_ok=True)
    text = json.dumps(data, ensure_ascii=False, indent=2)
    if not secret:
        path.write_text(text, encoding='utf-8')
        return path
    # The mode argument only applies when the file is newly created ...
    fd = os.open(path, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600)
    with os.fdopen(fd, 'w', encoding='utf-8') as fh:
        # ... so also tighten a pre-existing (now truncated) file before writing.
        os.chmod(path, 0o600)
        fh.write(text)
    return path


def header_map(headers):
    """Convert a list of ``{'name': ..., 'value': ...}`` dicts into a dict.

    Names are lower-cased; entries with a missing or empty name are skipped.
    A falsy *headers* yields an empty dict. Later duplicates overwrite
    earlier ones.
    """
    lowered = {}
    for header in headers or []:
        if not header.get('name'):
            continue
        lowered[header.get('name', '').lower()] = header.get('value', '')
    return lowered


def parse_cookie_header(cookie_header):
    """Split a ``Cookie`` header string into a ``{name: value}`` dict.

    Segments are separated by ``;``. Segments without ``=`` or with an empty
    name are ignored; names and values are whitespace-stripped. A falsy
    header yields an empty dict.
    """
    cookies = {}
    for chunk in (cookie_header or '').split(';'):
        name, sep, value = chunk.partition('=')
        name = name.strip()
        if sep and name:
            cookies[name] = value.strip()
    return cookies


def cookie_compare(a_header, b_header):
    """Summarize the overlap between two ``Cookie`` header strings.

    Returns a dict with per-side cookie counts, the sorted cookie names
    present on both sides, the subset of those whose values are equal, and
    whether the raw header strings are identical.
    """
    left = parse_cookie_header(a_header)
    right = parse_cookie_header(b_header)
    shared_names = sorted(left.keys() & right.keys())
    matching = [name for name in shared_names if left[name] == right[name]]
    return {
        'a_cookie_count': len(left),
        'b_cookie_count': len(right),
        'common_cookie_names_count': len(shared_names),
        'equal_name_value_count': len(matching),
        'identical_cookie_header': a_header == b_header,
        'common_cookie_names': shared_names,
        'equal_cookie_names': matching,
    }


def read_cookie_json(path):
    """Return the cookie header string stored in the JSON file at *path*.

    Looks, in order, at the top-level ``Cookie`` key, the top-level
    ``cookie`` key, and ``headers.Cookie``; the first truthy value wins.
    Returns ``''`` when nothing is found or the document is not an object.
    """
    data = load_json(path)
    if not isinstance(data, dict):
        return ''
    for key in ('Cookie', 'cookie'):
        value = data.get(key)
        if value:
            return value
    return data.get('headers', {}).get('Cookie') or ''


def safe_payload(payload):
    """Return a copy of *payload* that is safe to print or log.

    Dicts and lists are walked recursively; any dict value whose key
    (lower-cased) is in ``SENSITIVE_KEYS`` is replaced by ``'[REDACTED]'``.
    Other values are returned unchanged.
    """
    if isinstance(payload, list):
        return [safe_payload(item) for item in payload]
    if not isinstance(payload, dict):
        return payload
    redacted = {}
    for key, value in payload.items():
        if key.lower() in SENSITIVE_KEYS:
            redacted[key] = '[REDACTED]'
        else:
            redacted[key] = safe_payload(value)
    return redacted


def secretless_payload(payload):
    """Drop top-level keys listed in ``SENSITIVE_KEYS`` from a dict payload.

    Only the top level is filtered (nested containers are kept as-is), and
    non-dict payloads are returned unchanged.
    """
    if isinstance(payload, dict):
        kept = {}
        for key, value in payload.items():
            if key.lower() not in SENSITIVE_KEYS:
                kept[key] = value
        return kept
    return payload


def json_post_text(entry):
    """Return the JSON-decoded POST body of a HAR entry, or ``None``.

    ``None`` is returned when ``request.postData.text`` is missing/empty or
    cannot be parsed as JSON.
    """
    request = entry.get('request') or {}
    post_data = request.get('postData') or {}
    body = post_data.get('text') or ''
    if not body:
        return None
    try:
        parsed = json.loads(body)
    except Exception:
        parsed = None
    return parsed


def har_entries(path):
    """Load the HAR file at *path* and return its ``log.entries`` list.

    Missing ``log`` or ``entries`` keys yield an empty list.
    """
    har = load_json(path)
    log = har.get('log', {})
    return log.get('entries', [])


def request_url_parts(entry):
    """Return ``(url, netloc, path)`` for a HAR entry's request URL.

    A missing request or URL yields three empty strings.
    """
    full_url = entry.get('request', {}).get('url', '')
    parsed = urllib.parse.urlparse(full_url)
    return full_url, parsed.netloc, parsed.path


def find_best_request(entries):
    """Pick the HAR entry best suited for credential extraction.

    Only requests whose host ends with ``one.alimama.com`` and which carry a
    ``Cookie`` header are considered. Each candidate is scored: +100 when the
    path contains the create-download-task endpoint, +40 when the path starts
    with ``/report/``, and +10 each when the JSON body contains ``csrfId`` /
    ``loginPointId``. The first highest-scoring entry wins; if no entry
    scores, the first eligible entry is returned; otherwise ``None``.
    """
    scored = []
    unscored = []
    for entry in entries:
        _, host, path = request_url_parts(entry)
        if not host.endswith('one.alimama.com'):
            continue
        request_headers = header_map(entry.get('request', {}).get('headers'))
        if not request_headers.get('cookie'):
            continue
        payload = json_post_text(entry) or {}
        score = sum((
            100 if '/report/createDownLoadTask.json' in path else 0,
            40 if path.startswith('/report/') else 0,
            10 if 'csrfId' in payload else 0,
            10 if 'loginPointId' in payload else 0,
        ))
        (scored if score else unscored).append((score, entry))
    if scored:
        # max() returns the first maximal element, matching a stable descending sort.
        return max(scored, key=lambda pair: pair[0])[1]
    if unscored:
        return unscored[0][1]
    return None


def find_first_url(entries, path_suffix):
    """Return the first request URL on ``one.alimama.com`` whose path equals *path_suffix*.

    Falls back to ``BASE_URL + path_suffix`` when no entry matches.
    """
    matches = (
        url
        for url, host, path in map(request_url_parts, entries)
        if host.endswith('one.alimama.com') and path == path_suffix
    )
    return next(matches, BASE_URL + path_suffix)


def infer_report_key(payload):
    """Derive the report key for a create-download-task payload.

    ``rptType`` (plus ``queryDomains`` for account/scene reports) is checked
    first; when it is not one of the known types, markers in ``excelName``
    are used. Falls back to the raw ``rptType`` or ``'unknown'``.
    """
    rpt_type = str(payload.get('rptType') or '')
    domains = payload.get('queryDomains') or []
    excel_name = str(payload.get('excelName') or '')

    if rpt_type == 'account' and 'scene' in domains:
        return 'scene'
    if rpt_type in ('campaign', 'bidword'):
        return rpt_type
    # The excel-name check for item reports deliberately precedes the
    # remaining rptType checks, preserving the original precedence.
    if rpt_type == 'item_promotion' or '商品报表' in excel_name:
        return 'item_promotion'
    if rpt_type in ('crowd', 'creative', 'area'):
        return rpt_type
    for marker, key in (('营销场景', 'scene'), ('计划', 'campaign'), ('关键词', 'bidword')):
        if marker in excel_name:
            return key
    return rpt_type or 'unknown'


def collect_download_templates(har_path):
    """Extract de-duplicated create-download-task templates from a HAR file.

    Every POST to ``/report/createDownLoadTask.json`` on ``one.alimama.com``
    with a JSON-object body becomes one template. Templates are de-duplicated
    on (report key, secret-stripped payload) and sorted by ``REPORT_PRIORITY``
    with unknown report keys last.
    """
    endpoint = '/report/createDownLoadTask.json'
    templates = []
    fingerprints = set()
    for index, entry in enumerate(har_entries(har_path)):
        _, host, path = request_url_parts(entry)
        if path != endpoint or not host.endswith('one.alimama.com'):
            continue
        payload = json_post_text(entry)
        if not isinstance(payload, dict):
            continue
        report_key = infer_report_key(payload)
        stripped = secretless_payload(payload)
        fingerprint = (report_key, json.dumps(stripped, ensure_ascii=False, sort_keys=True))
        if fingerprint in fingerprints:
            continue
        fingerprints.add(fingerprint)
        template = {
            'report_key': report_key,
            'report_label': REPORT_LABELS.get(report_key, payload.get('excelName') or report_key),
            'har_entry_index': index,
            'endpoint_path': '/report/createDownLoadTask.json',
            'payload': stripped,
            'payload_preview': safe_payload(payload),
            'original_excel_name': payload.get('excelName'),
            'rptType': payload.get('rptType'),
            'queryDomains': payload.get('queryDomains'),
            'startTime': payload.get('startTime'),
            'endTime': payload.get('endTime'),
            'splitType': payload.get('splitType'),
        }
        templates.append(template)
    # list.sort is stable, so templates sharing a rank keep HAR order.
    rank = {key: position for position, key in enumerate(REPORT_PRIORITY)}
    templates.sort(key=lambda tpl: rank.get(tpl['report_key'], 99))
    return templates


def month_range(month):
    """Return ISO dates ``(first_day, last_day)`` for a ``YYYY-MM`` string.

    Raises ``ValueError`` when *month* is malformed or out of range.
    """
    year_text, month_text = month.split('-', 1)
    year, month_no = int(year_text), int(month_text)
    first_day = dt.date(year, month_no, 1)
    # The last day is one day before the first day of the following month.
    next_year, next_month = (year + 1, 1) if month_no == 12 else (year, month_no + 1)
    last_day = dt.date(next_year, next_month, 1) - dt.timedelta(days=1)
    return first_day.isoformat(), last_day.isoformat()


def sanitize_filename(name):
    """Make *name* usable as a file name.

    Runs of path-unsafe characters and whitespace collapse to a single ``_``;
    leading/trailing underscores are trimmed. An empty result falls back to
    ``'alimama_report'``.
    """
    collapsed = re.sub(r'[\\/:*?"<>|\s]+', '_', str(name))
    trimmed = collapsed.strip('_')
    if not trimmed:
        return 'alimama_report'
    return trimmed


def load_secret_bundle(store, secret_dir):
    """Load the stored request headers and session data for *store*.

    Reads ``{store}_cookie.json``, ``{store}_headers.json`` and
    ``{store}_session.json`` from *secret_dir*.

    Returns:
        ``(headers, session)`` where *headers* holds the non-empty stored
        headers plus a ``Cookie`` entry, and *session* is the parsed session
        document.

    Raises:
        SystemExit: if any of the three files is missing.
    """
    base = Path(secret_dir)
    cookie_path = base / f'{store}_cookie.json'
    headers_path = base / f'{store}_headers.json'
    session_path = base / f'{store}_session.json'
    if not all(p.exists() for p in (cookie_path, headers_path, session_path)):
        raise SystemExit(f'MISSING credentials for store={store}; run extract-credentials first')
    cookie = read_cookie_json(cookie_path)
    stored_headers = load_json(headers_path)
    session = load_json(session_path)
    request_headers = {name: value for name, value in stored_headers.items() if value}
    request_headers['Cookie'] = cookie
    return request_headers, session


def open_json_request(url, method='GET', headers=None, payload=None, timeout=30):
    """Send an HTTP request and return ``(parsed_json, response_headers)``.

    When *payload* is given it is sent as a compact UTF-8 JSON body, and a
    JSON ``Content-Type`` is added to a copy of *headers* unless that exact
    key is already present (the caller's dict is never mutated).

    Raises:
        RuntimeError: if the response body is not valid JSON.
    """
    request_headers = headers or {}
    body = None
    if payload is not None:
        request_headers = dict(request_headers)
        request_headers.setdefault('Content-Type', 'application/json;charset=UTF-8')
        body = json.dumps(payload, ensure_ascii=False, separators=(',', ':')).encode('utf-8')
    request = urllib.request.Request(url, data=body, headers=request_headers, method=method)
    with urllib.request.urlopen(request, timeout=timeout) as resp:
        raw = resp.read()
        ctype = resp.headers.get('Content-Type', '')
        text = raw.decode('utf-8', errors='replace')
        try:
            parsed = json.loads(text)
        except Exception:
            raise RuntimeError(f'non-json response status={resp.status} ctype={ctype} preview={text[:200]!r}')
        return parsed, resp.headers

def extract_download_url(obj):
    """Depth-first search of a JSON-like structure for a download URL.

    In each dict the keys ``downloadUrl``, ``url``, ``fileUrl`` and
    ``ossUrl`` are checked (in that order) for an http(s) string before
    descending into the dict's values. Lists are searched element by
    element. Returns the first URL found, else ``None``.
    """
    if isinstance(obj, dict):
        for key in ('downloadUrl', 'url', 'fileUrl', 'ossUrl'):
            candidate = obj.get(key)
            if isinstance(candidate, str) and candidate.startswith(('http://', 'https://')):
                return candidate
        children = obj.values()
    elif isinstance(obj, list):
        children = obj
    else:
        return None
    for child in children:
        nested = extract_download_url(child)
        if nested:
            return nested
    return None


def create_task(template, headers, session, month, dry_run=False, split_type='day'):
    """Create an asynchronous report download task for one template and month.

    The template payload is copied, its date range set to the full natural
    month, comparison/summary fields removed, and the session's ``csrfId`` /
    ``loginPointId`` injected.

    Returns:
        With *dry_run*, a preview dict (no request is sent). Otherwise a
        dict with ``taskId``, ``report_key``, ``excelName`` and
        ``response_ok``.

    Raises:
        RuntimeError: if the response carries no ``taskId``.
    """
    start, end = month_range(month)
    payload = dict(template['payload'])
    payload['startTime'] = start
    payload['endTime'] = end
    # Do not carry UI comparison-period fields from the captured HAR into monthly exports.
    # Leaving vsType/vsTime in place made the platform return rolling comparison ranges.
    stale_keys = [key for key in payload if key.startswith(('vsType', 'vsTime')) or key == '_sum']
    for key in stale_keys:
        del payload[key]
    # The captured bidword HAR was weekly; for monthly report exports default to daily rows
    # so the downstream monthly builder can aggregate exact natural-month data.
    payload['splitType'] = split_type
    payload['csrfId'] = session.get('csrfId')
    payload['loginPointId'] = session.get('loginPointId')
    payload.setdefault('bizCode', session.get('bizCode') or 'universalBP')
    payload['source'] = payload.get('source') or 'async_dowdload'
    label = REPORT_LABELS.get(template['report_key'], template['report_key'])
    stamp = dt.datetime.now().strftime('%Y%m%d_%H%M%S')
    payload['excelName'] = f"{label}_{month.replace('-', '')}_{stamp}"
    url = session.get('create_url') or (BASE_URL + '/report/createDownLoadTask.json')
    if dry_run:
        return {
            'dry_run': True,
            'report_key': template['report_key'],
            'excelName': payload['excelName'],
            'payload_preview': safe_payload(payload),
        }
    data, _ = open_json_request(url, method='POST', headers=headers, payload=payload, timeout=60)
    body = data or {}
    task_id = (body.get('data') or {}).get('taskId')
    ok = (body.get('info') or {}).get('ok')
    if not task_id:
        raise RuntimeError(f'create task failed for {template["report_key"]}: ok={ok} response={safe_payload(data)}')
    return {
        'taskId': task_id,
        'report_key': template['report_key'],
        'excelName': payload['excelName'],
        'response_ok': ok,
    }


def poll_download_url(task_id, headers, session, attempts=80, seconds=5):
    """Poll the async-report endpoint until a download URL is available.

    Args:
        task_id: Task identifier returned by task creation.
        headers: Request headers sent with every poll.
        session: Session dict; ``get_download_url`` and ``bizCode`` are read.
        attempts: Maximum number of polls.
        seconds: Delay between consecutive polls.

    Returns:
        ``(download_url, attempt)`` where *attempt* is the 1-based poll that
        produced the URL.

    Raises:
        TimeoutError: if no URL appears within *attempts* polls.
    """
    base = session.get('get_download_url') or (COMMONAPI_BASE_URL + '/commonapi/report/async/getDownloadUrl.json')
    parsed = urllib.parse.urlparse(base)
    # A stored /commonapi/ URL on one.alimama.com is redirected to the bpcommon host.
    if parsed.netloc == 'one.alimama.com' and parsed.path.startswith('/commonapi/'):
        parsed = parsed._replace(scheme='https', netloc='bpcommon.alimama.com')
    qs = dict(urllib.parse.parse_qsl(parsed.query, keep_blank_values=True))
    qs.update({'taskId': str(task_id), 'bizCode': session.get('bizCode') or 'universalBP'})
    url = urllib.parse.urlunparse(parsed._replace(query=urllib.parse.urlencode(qs)))
    last = None
    for attempt in range(1, attempts + 1):
        data, _ = open_json_request(url, method='GET', headers=headers, timeout=30)
        last = data
        dl = extract_download_url(data)
        if dl:
            return dl, attempt
        try:
            status = data.get('data', {}).get('result', {}).get('status')
        except (AttributeError, TypeError):
            # Response (or an intermediate node) is not a dict; status unknown.
            status = None
        print(json.dumps({'event': 'poll', 'taskId': task_id, 'attempt': attempt, 'status': status, 'downloadUrl': bool(dl)}, ensure_ascii=False))
        # Do not wait after the final attempt: nothing is polled afterwards.
        if attempt < attempts:
            time.sleep(seconds)
    raise TimeoutError(f'task {task_id} did not produce downloadUrl after {attempts} attempts; last={safe_payload(last)}')


def download_file(url, out_path, headers=None):
    """Download *url* to *out_path* and return the number of bytes written.

    Parent directories of *out_path* are created as needed.
    """
    # OSS signed URLs can reject the original one.alimama.com request headers.
    # Forward only User-Agent; omit Cookie/Origin/Referer/Accept/etc.
    user_agent = headers.get('User-Agent') if headers else None
    request_headers = {'User-Agent': user_agent} if user_agent else {}
    request = urllib.request.Request(url, headers=request_headers)
    with urllib.request.urlopen(request, timeout=120) as response:
        content = response.read()
    target = Path(out_path)
    target.parent.mkdir(parents=True, exist_ok=True)
    target.write_bytes(content)
    return len(content)


def decode_csv_bytes(raw):
    """Decode CSV bytes, returning ``(text, encoding_label)``.

    Tries ``utf-8-sig``, ``gb18030`` and ``gbk`` in that order. If all fail,
    decodes as UTF-8 with replacement characters and reports the label
    ``'utf-8-replace'``.
    """
    for encoding in ('utf-8-sig', 'gb18030', 'gbk'):
        try:
            decoded = raw.decode(encoding)
        except UnicodeDecodeError:
            continue
        return decoded, encoding
    return raw.decode('utf-8', errors='replace'), 'utf-8-replace'


def normalize_download_artifact(raw_path, stem):
    """Return safe event details and produce a real .xlsx when platform returns zip-of-csv.

    Three cases are handled:

    * The download is itself an xlsx (a zip containing ``[Content_Types].xml``):
      it is copied to ``{stem}.xlsx`` next to the raw file.
    * The download is a zip containing at least one ``.csv``: the first CSV
      is decoded, written as UTF-8 to ``{stem}.csv`` and, if ``openpyxl`` is
      importable and the conversion succeeds, converted to ``{stem}.xlsx``.
    * Anything else: the raw file is reported with ``valid_xlsx`` False.

    Args:
        raw_path: Path of the downloaded file.
        stem: File-name stem for generated ``.csv`` / ``.xlsx`` files.

    Returns:
        A dict describing the artifact (``file``, ``valid_xlsx``, ``kind``, ...).
    """
    import io  # local import: only needed for CSV parsing in this function

    raw_path = Path(raw_path)
    out_dir = raw_path.parent
    result = {'raw_file': str(raw_path), 'raw_bytes': raw_path.stat().st_size}
    if zipfile.is_zipfile(raw_path):
        with zipfile.ZipFile(raw_path) as z:
            names = z.namelist()
            result['zip_names'] = names
            if '[Content_Types].xml' in names:
                xlsx_path = out_dir / f'{stem}.xlsx'
                if raw_path != xlsx_path:
                    xlsx_path.write_bytes(raw_path.read_bytes())
                result.update({'file': str(xlsx_path), 'valid_xlsx': True, 'kind': 'xlsx'})
                return result
            csv_names = [n for n in names if n.lower().endswith('.csv')]
            if csv_names:
                csv_bytes = z.read(csv_names[0])
                text, enc = decode_csv_bytes(csv_bytes)
                csv_path = out_dir / f'{stem}.csv'
                # newline='' keeps the CSV's own line endings instead of applying
                # platform newline translation on write.
                with csv_path.open('w', encoding='utf-8', newline='') as fh:
                    fh.write(text)
                result.update({'csv_file': str(csv_path), 'csv_encoding': enc, 'kind': 'zip_csv'})
                try:
                    import openpyxl
                    wb = openpyxl.Workbook()
                    ws = wb.active
                    ws.title = 'data'
                    # Feed the csv module a newline-preserving stream so quoted
                    # fields containing line breaks stay in one cell;
                    # str.splitlines() would cut them apart.
                    for row in csv.reader(io.StringIO(text, newline='')):
                        ws.append(row)
                    xlsx_path = out_dir / f'{stem}.xlsx'
                    wb.save(xlsx_path)
                    result.update({'file': str(xlsx_path), 'valid_xlsx': zipfile.is_zipfile(xlsx_path), 'xlsx_converted': True})
                except Exception as exc:
                    # Best effort: without openpyxl (or on conversion failure) report the CSV.
                    result.update({'file': str(csv_path), 'valid_xlsx': False, 'xlsx_converted': False, 'xlsx_error': str(exc)})
                return result
    # Fallback: neither an xlsx nor a zip containing a CSV; report the raw file as-is.
    result.update({'file': str(raw_path), 'valid_xlsx': False, 'kind': 'raw'})
    return result


def cmd_doctor(args):
    """Print a JSON self-check: Python version, resolved secret dir, and command names."""
    report = {
        'ok': True,
        'python': sys.version.split()[0],
        'secret_dir': str(Path(args.secret_dir).expanduser()),
        'commands': ['scan-har', 'extract-credentials', 'compare-credentials', 'build-downloads', 'export-month'],
    }
    print(json.dumps(report, ensure_ascii=False, indent=2))


def cmd_scan_har(args):
    """Summarize a HAR file: entry count, top 20 hosts by request count, and download templates.

    Template payloads are omitted from the summary (only the redacted
    preview is kept). The summary is printed as JSON and additionally
    written to ``args.out`` when given.
    """
    entries = har_entries(args.har)
    templates = collect_download_templates(args.har)
    host_counts = {}
    for entry in entries:
        host = request_url_parts(entry)[1]
        host_counts[host] = host_counts.get(host, 0) + 1
    busiest_hosts = sorted(host_counts.items(), key=lambda pair: pair[1], reverse=True)[:20]
    public_templates = [
        {field: value for field, value in template.items() if field != 'payload'}
        for template in templates
    ]
    result = {
        'har': str(args.har),
        'entry_count': len(entries),
        'hosts_top': busiest_hosts,
        'download_template_count': len(templates),
        'download_templates': public_templates,
    }
    if args.out:
        write_json(args.out, result)
    print(json.dumps(result, ensure_ascii=False, indent=2))


def cmd_extract_credentials(args):
    """Extract cookie, whitelisted headers and session fields from a HAR into the secret dir.

    Writes ``{store}_cookie.json``, ``{store}_headers.json`` and
    ``{store}_session.json`` (all via ``write_json(..., secret=True)``),
    optionally a comparison against the cookie files in
    ``args.compare_qianniu_dir``, and prints a summary that contains no
    secret values.

    Raises:
        SystemExit: if the HAR holds no suitable request with a Cookie header.
    """
    entries = har_entries(args.har)
    best = find_best_request(entries)
    if best is None:
        raise SystemExit('No one.alimama.com request with Cookie found')
    request_headers = header_map(best.get('request', {}).get('headers'))
    payload = json_post_text(best) or {}
    # Keep only whitelisted headers, re-capitalized as e.g. ``User-Agent``.
    safe_headers = {
        '-'.join(piece.capitalize() for piece in name.split('-')): value
        for name, value in request_headers.items()
        if name in SAFE_HEADER_NAMES
    }
    safe_headers.setdefault('Origin', BASE_URL)
    safe_headers.setdefault('Referer', BASE_URL + '/index.html')
    cookie_header = request_headers.get('cookie', '')
    secret_dir = Path(args.secret_dir)
    secret_dir.mkdir(parents=True, exist_ok=True)
    store = args.store
    cookie_names = sorted(parse_cookie_header(cookie_header).keys())
    session = {
        'store': store,
        'source_har_name': Path(args.har).name,
        'base_url': BASE_URL,
        'bizCode': payload.get('bizCode') or 'universalBP',
        'csrfId': payload.get('csrfId'),
        'loginPointId': payload.get('loginPointId'),
        'create_url': find_first_url(entries, '/report/createDownLoadTask.json'),
        'get_download_url': COMMONAPI_BASE_URL + '/commonapi/report/async/getDownloadUrl.json',
        'find_page_url': COMMONAPI_BASE_URL + '/commonapi/report/async/findPage.json',
        'cookie_names': cookie_names,
        'created_at': dt.datetime.now().isoformat(timespec='seconds'),
    }
    write_json(secret_dir / f'{store}_cookie.json', {'Cookie': cookie_header}, secret=True)
    write_json(secret_dir / f'{store}_headers.json', safe_headers, secret=True)
    write_json(secret_dir / f'{store}_session.json', session, secret=True)
    compare_summary = None
    if args.compare_qianniu_dir:
        compare_summary = compare_with_qianniu(cookie_header, args.compare_qianniu_dir)
        write_json(secret_dir / f'{store}_qianniu_compare.json', compare_summary, secret=True)
    qianniu_public = summarize_compare(compare_summary) if compare_summary else None
    public = {
        'ok': True,
        'store': store,
        'secret_files': [f'{store}_cookie.json', f'{store}_headers.json', f'{store}_session.json'],
        'cookie_count': len(cookie_names),
        'has_csrfId': bool(session.get('csrfId')),
        'has_loginPointId': bool(session.get('loginPointId')),
        'qianniu_compare': qianniu_public,
    }
    print(json.dumps(public, ensure_ascii=False, indent=2))


def compare_with_qianniu(cookie_header, qn_dir):
    """Compare *cookie_header* with every ``*_cookie.json`` file in *qn_dir*.

    Returns a dict keyed by file name (processed in sorted path order); each
    value is the ``cookie_compare`` result, or ``{'error': message}`` when
    that file could not be read or compared.
    """
    comparisons = {}
    for cookie_file in sorted(Path(qn_dir).glob('*_cookie.json')):
        try:
            outcome = cookie_compare(cookie_header, read_cookie_json(cookie_file))
        except Exception as exc:
            # Best effort: one unreadable file must not abort the whole comparison.
            outcome = {'error': str(exc)}
        comparisons[cookie_file.name] = outcome
    return comparisons


def summarize_compare(compare):
    """Reduce comparison results to their count/identity fields.

    Cookie name lists are dropped. Fields absent from an item (for example
    on error items) become ``None``. Returns ``None`` for a falsy input.
    """
    if not compare:
        return None
    fields = (
        'identical_cookie_header',
        'common_cookie_names_count',
        'equal_name_value_count',
        'a_cookie_count',
        'b_cookie_count',
    )
    return {
        name: {field: item.get(field) for field in fields}
        for name, item in compare.items()
    }


def cmd_compare_credentials(args):
    """Print a JSON summary comparing ``args.cookie_file`` with the cookie files in ``args.qianniu_dir``."""
    cookie_header = read_cookie_json(args.cookie_file)
    summary = summarize_compare(compare_with_qianniu(cookie_header, args.qianniu_dir))
    print(json.dumps(summary, ensure_ascii=False, indent=2))


def cmd_build_downloads(args):
    """Write a download manifest built from the HAR's templates to ``args.out``.

    The manifest holds the collected templates plus fixed lists of
    recommended, optional and deferred report keys. A short JSON summary is
    printed afterwards.
    """
    templates = collect_download_templates(args.har)
    write_json(args.out, {
        'source_har': str(args.har),
        'created_at': dt.datetime.now().isoformat(timespec='seconds'),
        'templates': templates,
        'recommended_reports': ['campaign', 'scene', 'bidword'],
        'optional_reports': ['item_promotion', 'crowd'],
        'deferred_reports': ['creative', 'area'],
    })
    summary = {
        'ok': True,
        'out': str(args.out),
        'template_count': len(templates),
        'reports': [template['report_key'] for template in templates],
    }
    print(json.dumps(summary, ensure_ascii=False, indent=2))


def load_manifest(path):
    """Return the ``templates`` list of the manifest at *path* (empty list if missing or falsy)."""
    return load_json(path).get('templates') or []


def cmd_export_month(args):
    """Export the selected reports for one month.

    For each selected template a download task is created; unless
    ``--dry-run`` is set, the task is polled, the file downloaded into
    ``args.out_dir`` and normalized to xlsx. Every step is printed as one
    JSON line, and on success all events are written to
    ``export_manifest.json`` in the output directory.

    Raises:
        SystemExit: if no manifest template matches the requested reports.
        RuntimeError: if a downloaded artifact is not a valid xlsx.
    """
    headers, session = load_secret_bundle(args.store, args.secret_dir)
    templates = load_manifest(args.manifest)
    wanted = [name.strip() for name in args.reports.split(',') if name.strip()]
    selected = [template for template in templates if template.get('report_key') in wanted]
    if args.limit:
        selected = selected[:args.limit]
    if not selected:
        raise SystemExit(f'No matching reports in manifest: {wanted}')
    out_dir = Path(args.out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)
    events = []

    def record(event):
        # Keep the event for the manifest and echo it as a single JSON line.
        events.append(event)
        print(json.dumps(event, ensure_ascii=False))

    for template in selected:
        created = create_task(template, headers, session, args.month, dry_run=args.dry_run, split_type=args.split_type)
        record({'event': 'create_task', **created})
        if args.dry_run:
            continue
        task_id = created['taskId']
        dl_url, poll_attempt = poll_download_url(task_id, headers, session, attempts=args.poll_attempts, seconds=args.poll_seconds)
        stem = sanitize_filename(created['excelName'])
        raw_path = out_dir / f'{stem}.zip'
        size = download_file(dl_url, raw_path, headers=headers)
        artifact = normalize_download_artifact(raw_path, stem)
        event = {
            'event': 'downloaded',
            'report_key': template['report_key'],
            'taskId': task_id,
            'poll_attempt': poll_attempt,
            'bytes': size,
            **artifact,
        }
        record(event)
        if not event.get('valid_xlsx'):
            raise RuntimeError(f'Downloaded artifact could not be converted to a valid xlsx: {event}')
    write_json(out_dir / 'export_manifest.json', {'store': args.store, 'month': args.month, 'events': events})
    summary = {
        'ok': True,
        'store': args.store,
        'month': args.month,
        'out_dir': str(out_dir),
        'event_count': len(events),
    }
    print(json.dumps(summary, ensure_ascii=False, indent=2))


def build_parser():
    """Build the ``alimama-export`` argument parser with one sub-command per ``cmd_*`` handler."""
    parser = argparse.ArgumentParser(prog='alimama-export', description='HAR-driven Alimama/Wanxiangtai report exporter')
    parser.add_argument('--secret-dir', default=str(SECRET_DIR_DEFAULT))
    commands = parser.add_subparsers(dest='cmd', required=True)

    doctor = commands.add_parser('doctor')
    doctor.set_defaults(func=cmd_doctor)

    scan = commands.add_parser('scan-har')
    scan.add_argument('har')
    scan.add_argument('--out')
    scan.set_defaults(func=cmd_scan_har)

    extract = commands.add_parser('extract-credentials')
    extract.add_argument('har')
    extract.add_argument('--store', required=True)
    extract.add_argument('--compare-qianniu-dir')
    extract.set_defaults(func=cmd_extract_credentials)

    compare = commands.add_parser('compare-credentials')
    compare.add_argument('--cookie-file', required=True)
    compare.add_argument('--qianniu-dir', required=True)
    compare.set_defaults(func=cmd_compare_credentials)

    build = commands.add_parser('build-downloads')
    build.add_argument('har')
    build.add_argument('--out', required=True)
    build.set_defaults(func=cmd_build_downloads)

    export = commands.add_parser('export-month')
    export.add_argument('--month', required=True, help='YYYY-MM')
    export.add_argument('--store', required=True)
    export.add_argument('--manifest', required=True)
    export.add_argument('--reports', default='campaign,scene,bidword')
    export.add_argument('--out-dir', required=True)
    export.add_argument('--dry-run', action='store_true')
    export.add_argument('--split-type', default='day', choices=['day', 'month'], help='day is default for exact natural-month aggregation; month follows the UI month split')
    export.add_argument('--limit', type=int)
    export.add_argument('--poll-attempts', type=int, default=80)
    export.add_argument('--poll-seconds', type=int, default=5)
    export.set_defaults(func=cmd_export_month)
    return parser


def main(argv=None):
    """Parse *argv* (``sys.argv[1:]`` when ``None``) and run the selected sub-command handler."""
    args = build_parser().parse_args(argv)
    args.func(args)


# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == '__main__':
    main()
