"""
Download Chandra Bullet Cluster ObsIDs (canonical 500ks ACIS-I dataset).
ObsIDs from DOI 10.25574/cdc.373: 3184, 4984, 4985, 4986, 5355, 5356, 5357, 5358, 5361

Direct retrieval from https://cxc.cfa.harvard.edu/cdaftp/byobsid/[last_digit]/[obsid]/
No CIAO required.
"""
import os
import ssl
import sys
import traceback
import urllib.request
from html.parser import HTMLParser

import urllib3

# Archive root, the ObsIDs to mirror, and the local destination folder.
BASE = "https://cxc.cfa.harvard.edu/cdaftp/byobsid"
OBSIDS = [
    3184, 4984, 4985, 4986,
    5355, 5356, 5357, 5358, 5361,
]
OUT_DIR = "Chandra_BulletCluster_cdc373"

# Silence urllib3's InsecureRequestWarning.
# NOTE(review): every request in this file is issued through urllib.request,
# so this call may be a leftover -- confirm before removing it.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# SSL context with hostname checking and certificate verification switched
# off (some Chandra archive certs may be old).
# NOTE(review): with verification off, responses are not authenticated;
# treat everything read from the server as untrusted input.
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

# Make sure the download root exists before anything is fetched.
os.makedirs(OUT_DIR, exist_ok=True)


class FileLister(HTMLParser):
    """Collect entry names from an HTML directory index.

    After ``feed()``, ``files`` holds the hrefs of plain entries and ``dirs``
    the hrefs of sub-directories (each keeping its trailing ``/``), both in
    document order.

    Only hrefs that consist of a single path component are kept.  Sort links
    (``?...``), absolute paths (``/...``), the parent link (``../``),
    fragments, full URLs and anything containing a further slash or a
    backslash are dropped: callers append the href to a URL *and* to a local
    directory, so an href such as ``../../x.fits`` would otherwise escape the
    download directory.
    """

    # Characters that never occur in a single-component entry name:
    # path separators, URL scheme/drive colon, query and fragment markers.
    _FORBIDDEN = frozenset('/\\:?#')

    def __init__(self):
        super().__init__()
        self.files = []
        self.dirs = []

    @classmethod
    def _is_plain_name(cls, name):
        """Return True if *name* is one non-empty, non-dot path component."""
        return (
            bool(name)
            and name not in ('.', '..')
            and cls._FORBIDDEN.isdisjoint(name)
        )

    def handle_starttag(self, tag, attrs):
        """Record the href of each ``<a>`` tag that names a direct child."""
        if tag != 'a':
            return
        # A valueless ``href`` attribute is reported as None by HTMLParser.
        href = dict(attrs).get('href')
        if not href:
            return
        is_dir = href.endswith('/')
        name = href[:-1] if is_dir else href
        if not self._is_plain_name(name):
            return
        (self.dirs if is_dir else self.files).append(href)


def list_remote(url):
    """Fetch the directory index at *url* and return ``(files, dirs)``.

    The page is requested with the module-level SSL context and a 60 second
    timeout, decoded as UTF-8 (undecodable bytes are replaced) and scanned
    with ``FileLister``.  The two returned lists are that parser's ``files``
    and ``dirs`` attributes.
    """
    request = urllib.request.Request(
        url,
        headers={'User-Agent': 'Mozilla/5.0 BCR-FUM-UM/1.0'},
    )
    with urllib.request.urlopen(request, context=ctx, timeout=60) as response:
        raw = response.read()
    page = raw.decode('utf-8', errors='replace')

    lister = FileLister()
    lister.feed(page)
    return lister.files, lister.dirs


def download(url, target):
    """Download *url* to the local path *target* unless it is already there.

    The body is streamed into ``target + '.part'`` and only renamed onto
    *target* once the transfer has finished, so an interrupted run can never
    leave a truncated file that a later run would mistake for a complete one.

    Returns:
        ``(size_in_bytes, skipped)``.  ``skipped`` is True when a non-empty
        file already existed at *target* and nothing was fetched.

    Raises:
        OSError: the number of bytes received differs from the server's
            ``Content-Length``.  Errors raised by ``urlopen`` and by the
            file operations propagate unchanged.
    """
    if os.path.exists(target) and os.path.getsize(target) > 0:
        return os.path.getsize(target), True

    partial = target + '.part'
    req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0 BCR-FUM-UM/1.0'})
    try:
        with urllib.request.urlopen(req, context=ctx, timeout=300) as r:
            try:
                expected = int(r.headers.get('Content-Length', ''))
            except ValueError:
                expected = None  # header missing or malformed: nothing to check
            received = 0
            with open(partial, 'wb') as f:
                for chunk in iter(lambda: r.read(1024 * 256), b''):
                    f.write(chunk)
                    received += len(chunk)
        # A sized read() returns b'' on premature EOF instead of raising, so
        # a dropped connection must be detected by comparing byte counts.
        if expected is not None and expected >= 0 and received != expected:
            raise OSError(
                f"incomplete download of {url}: "
                f"got {received} of {expected} bytes"
            )
        os.replace(partial, target)
    except BaseException:
        # Also runs on Ctrl-C.  Cleanup is best-effort and must never mask
        # the error that brought us here.
        try:
            os.remove(partial)
        except OSError:
            pass
        raise
    return os.path.getsize(target), False


# Substrings identifying the secondary/ products that are mirrored.
_SECONDARY_PRODUCTS = (
    '_evt1.fits', '_asol1.fits', '_bpix1.fits',
    '_aspsol1.fits', '_pbk0.fits', '_mtl1.fits',
)


def _wanted_root(fn):
    """Root level: FITS files, the ``00README`` file and PDF files."""
    return fn == '00README' or fn.endswith(('.fits', '.fits.gz', '.pdf'))


def _wanted_primary(fn):
    """primary/: gzipped FITS files only."""
    return fn.endswith('.fits.gz')


def _wanted_secondary(fn):
    """secondary/: FITS files whose name contains a listed product marker."""
    return (
        fn.endswith(('.fits.gz', '.fits'))
        and any(marker in fn for marker in _SECONDARY_PRODUCTS)
    )


def _download_matching(names, remote_url, local_dir, label, wanted):
    """Download each name accepted by *wanted* and print one status line.

    *label* is the prefix shown before the file name in the log
    (``''``, ``'primary/'`` or ``'secondary/'``).
    """
    for fn in names:
        if not wanted(fn):
            continue
        sz, skipped = download(remote_url + fn, os.path.join(local_dir, fn))
        tag = 'SKIP' if skipped else 'OK'
        print(f"  [{tag}] {label}{fn}  ({sz/1e6:.1f} MB)")


def fetch_obsid(obsid):
    """Mirror the selected products of one ObsID under ``OUT_DIR/<obsid>/``.

    Three listings are processed in order: the ObsID root, ``primary/`` and
    ``secondary/``.  Errors from the root and ``primary/`` steps propagate
    to the caller.  ``secondary/`` is best-effort: a failure there is
    printed and swallowed, with listing failures and download failures
    reported separately.
    """
    # The archive shards ObsIDs by their last decimal digit.
    base_url = f"{BASE}/{obsid % 10}/{obsid}/"
    obsid_dir = os.path.join(OUT_DIR, str(obsid))
    os.makedirs(obsid_dir, exist_ok=True)

    print(f"\n=== ObsID {obsid} ===")
    print(f"URL: {base_url}")

    # Root-level files
    files, dirs = list_remote(base_url)
    print(f"Root: {len(files)} files, {len(dirs)} subdirs")
    _download_matching(files, base_url, obsid_dir, '', _wanted_root)

    # Primary directory
    primary_url = base_url + 'primary/'
    primary_dir = os.path.join(obsid_dir, 'primary')
    os.makedirs(primary_dir, exist_ok=True)
    primary_files, _ = list_remote(primary_url)
    print(f"primary/: {len(primary_files)} files")
    _download_matching(
        primary_files, primary_url, primary_dir, 'primary/', _wanted_primary
    )

    # Secondary directory — only event-related files (skip large
    # per-detector cal products)
    secondary_url = base_url + 'secondary/'
    secondary_dir = os.path.join(obsid_dir, 'secondary')
    os.makedirs(secondary_dir, exist_ok=True)
    try:
        secondary_files, _ = list_remote(secondary_url)
    except Exception as e:
        print(f"  secondary/ unreachable: {e}")
        return
    print(f"secondary/: {len(secondary_files)} files (filtering to evt1/asol/bpix/aspsol)")
    try:
        _download_matching(
            secondary_files, secondary_url, secondary_dir,
            'secondary/', _wanted_secondary,
        )
    except Exception as e:
        # The listing worked, so do not call the directory "unreachable".
        print(f"  secondary/ download failed: {e}")


def _tree_size(root):
    """Return the total size in bytes of all files under *root*.

    ``os.walk`` yields nothing for a missing directory, so the result is 0
    in that case without a separate existence check.
    """
    return sum(
        os.path.getsize(os.path.join(dirpath, name))
        for dirpath, _, names in os.walk(root)
        for name in names
    )


# Size of whatever is already on disk, to report how much this run added.
total_pre = _tree_size(OUT_DIR)

# Each ObsID is fetched independently; one failure must not stop the rest.
failed = []
for obsid in OBSIDS:
    try:
        fetch_obsid(obsid)
    except Exception as exc:
        failed.append(obsid)
        print(f"!!! ObsID {obsid} failed: {type(exc).__name__}: {exc}")
        traceback.print_exc()

total_post = _tree_size(OUT_DIR)
print("\n=== Chandra Summary ===")
print(f"Before: {total_pre/1e9:.2f} GB")
print(f"After:  {total_post/1e9:.2f} GB")
print(f"Delta:  {(total_post-total_pre)/1e9:.2f} GB")
if failed:
    # Repeat the failures here so they are not lost in the scroll-back.
    print(f"Failed ObsIDs: {', '.join(str(o) for o in failed)}")