#!/usr/bin/env python3
"""
generate_markdown_mirrors.py
────────────────────────────
Walks a website folder, finds every index.html, strips chrome/widgets/noise,
converts the remaining content to clean Markdown, and writes index.md
next to each HTML file.

Usage:
    python generate_markdown_mirrors.py [SITE_ROOT]

SITE_ROOT defaults to the current working directory if omitted.

Re-runnable: safe to run again after site updates — it overwrites index.md files.
"""

import os
import re
import sys
import textwrap
from datetime import date
from pathlib import Path
from urllib.parse import urljoin, urlparse

from bs4 import BeautifulSoup, Comment
from markdownify import markdownify as md

# ── Configuration ────────────────────────────────────────────────────────────

# HTML tags to remove entirely (content + tag)
STRIP_TAGS = [
    "nav", "footer", "script", "style", "noscript", "iframe", "svg", "canvas",
]

# CSS classes to strip — any element with a class that is an exact match for
# STRIP_CLASS_EXACT, or that starts with one of STRIP_CLASS_PREFIXES, is removed
STRIP_CLASS_EXACT = {"nav", "footer", "cta-split"}
STRIP_CLASS_PREFIXES = ("ghl", "hs-", "hubspot", "chat-widget", "widget-", "cta_", "leadflows")

# IDs to strip (common chat / CRM widget injection points)
STRIP_IDS = {"hubspot-messages-iframe-container", "drift-widget",
             "intercom-container", "crisp-client", "ghl-chat-widget"}

# Page paths to skip (relative to site root, forward-slash separated)
SKIP_PATH_FRAGMENTS = {"/thanks/", "/404", "/404.html"}

# Noindex meta content values that mark a page as excluded
NOINDEX_VALUES = {"noindex", "noindex, nofollow", "none"}

# Base URL used when the root index.html does not declare an absolute canonical URL
DEFAULT_SITE_ROOT_URL = "https://curiositytech.in"

TODAY = date.today().isoformat()

# Individual robots directives that exclude a page, whatever else is listed
_NOINDEX_DIRECTIVES = {"noindex", "none"}

# Wrapper elements that are dropped when they end up with no content
_WRAPPER_TAGS = ["div", "span", "section", "article"]

# Line filters used by clean_markdown(), compiled once
_STEP_NUMBER_LINE = re.compile(r"\s*\d{1,2}\s*")
_SEPARATOR_LINE = re.compile(r"\s*[✦•·—–★✓✗►▶▸◆◇○●\-\*]+\s*")
_EMPTY_IMAGE = re.compile(r"!\[\s*\]\([^)]*\)")
_EXCESS_BLANK_LINES = re.compile(r"\n{3,}")

# Characters that force a frontmatter value into double quotes
_YAML_QUOTE_TRIGGERS = (":", "#", '"', "'", "\\")
# Characters that cannot start an unquoted YAML scalar safely
_YAML_LEADING_INDICATORS = ("-", "?", "[", "]", "{", "}", ",", "&", "*", "!", "|", ">", "%", "@", "`")


# ── Helpers ───────────────────────────────────────────────────────────────────

def has_strip_class(tag) -> bool:
    """Return True if any class on *tag* matches our strip rules.

    A class matches when it is in STRIP_CLASS_EXACT or starts with one of
    STRIP_CLASS_PREFIXES. Objects without a ``get`` method (and None) never
    match.
    """
    if tag is None or not hasattr(tag, "get"):
        return False
    classes = tag.get("class", []) or []
    if isinstance(classes, str):
        # Some parser configurations keep ``class`` as one string
        classes = classes.split()
    return any(
        cls in STRIP_CLASS_EXACT or cls.startswith(STRIP_CLASS_PREFIXES)
        for cls in classes
    )


def should_skip_page(html_path: Path, site_root: Path) -> tuple[bool, str]:
    """Return (skip, reason) for a given index.html path.

    A page is skipped when its site-relative path contains one of
    SKIP_PATH_FRAGMENTS, when it cannot be read/parsed, or when a
    ``<meta name="robots">`` tag carries a noindex directive.
    """
    rel = html_path.relative_to(site_root).as_posix()
    rooted = f"/{rel}"

    # Path-based skips (sorted so the reported reason is deterministic)
    for frag in sorted(SKIP_PATH_FRAGMENTS):
        if frag in rooted:
            return True, f"path contains '{frag}'"

    # Meta-robots noindex check
    try:
        soup = BeautifulSoup(
            html_path.read_text(encoding="utf-8", errors="replace"), "html.parser"
        )
    except Exception as exc:
        return True, f"parse error: {exc}"

    for meta in soup.find_all("meta", attrs={"name": re.compile(r"^robots$", re.I)}):
        content = (meta.get("content") or "").lower().strip()
        # The content is a comma-separated directive list, so "noindex,nofollow"
        # or "noindex, follow" must count as well as the exact configured values.
        directives = {part.strip() for part in content.split(",")}
        if content in NOINDEX_VALUES or directives & _NOINDEX_DIRECTIVES:
            return True, "noindex meta tag"

    return False, ""


def _find_canonical_href(soup) -> str:
    """Return the href of the first <link rel="canonical"> that has one, else ''."""
    for link in soup.find_all("link"):
        # rel may be a list (['canonical']) or a plain string ('canonical')
        rel_attr = link.get("rel") or []
        rel_tokens = rel_attr.split() if isinstance(rel_attr, str) else rel_attr
        if any(token.lower() == "canonical" for token in rel_tokens):
            href = (link.get("href") or "").strip()
            if href:
                return href
    return ""


def extract_meta(soup: BeautifulSoup, site_root_url: str, rel_path: str) -> dict:
    """Pull title, description, and canonical URL from a parsed page.

    Args:
        soup: Parsed page.
        site_root_url: Base URL of the site, used to build or resolve the URL.
        rel_path: Path of the HTML file relative to the site root (posix style).

    Returns:
        Dict with the keys ``title``, ``description`` and ``url``. The URL never
        ends with a slash.
    """
    title_tag = soup.find("title")
    title = title_tag.get_text(strip=True) if title_tag else ""

    # First non-empty description or og:description wins
    desc = ""
    for meta in soup.find_all("meta"):
        name = (meta.get("name") or meta.get("property") or "").lower()
        if name in ("description", "og:description"):
            desc = (meta.get("content") or "").strip()
            if desc:
                break

    # Canonical URL: prefer <link rel="canonical">, fall back to constructed URL
    base = site_root_url.rstrip("/")
    href = _find_canonical_href(soup)
    if href:
        parsed = urlparse(href)
        if not (parsed.scheme and parsed.netloc):
            # Relative or protocol-relative canonical: resolve against the site
            href = urljoin(f"{base}/", href)
        url = href.rstrip("/")
    else:
        # Build from site root + relative path. The root page's directory is
        # ".", which must not leak into the URL as a trailing "/.".
        page_dir = Path(rel_path).parent.as_posix()
        url = base if page_dir in ("", ".") else f"{base}/{page_dir}"
        url = url.rstrip("/")

    return {"title": title, "description": desc, "url": url}


def _decompose_outermost(tags) -> None:
    """Decompose *tags*, skipping any tag nested inside another listed tag.

    Destroying the outermost tag already removes its descendants; decomposing
    those descendants again would operate on tags that no longer have state.
    """
    tags = list(tags)
    doomed = {id(tag) for tag in tags}
    # Decide everything BEFORE the first decompose() call
    outermost = [
        tag for tag in tags
        if not any(id(parent) in doomed for parent in tag.parents)
    ]
    for tag in outermost:
        tag.decompose()


def _has_visible_content(tag) -> bool:
    """Return True if *tag* holds text or an image with non-blank alt text."""
    if tag.get_text(strip=True):
        return True
    return any((img.get("alt") or "").strip() for img in tag.find_all("img"))


def strip_noise(soup: BeautifulSoup) -> None:
    """Remove all noisy elements from the soup in-place.

    Removes HTML comments, every tag in STRIP_TAGS, elements matching the
    class/ID strip rules, and wrapper elements left without content.

    IMPORTANT: Always collect targets into a list BEFORE decomposing.
    Decomposing during a live find_all() iteration creates ghost tags
    with attrs=None that crash subsequent .get() calls.
    """
    # Remove HTML comments
    for comment in soup.find_all(string=lambda t: isinstance(t, Comment)):
        comment.extract()

    # Remove by tag name
    _decompose_outermost(soup.find_all(STRIP_TAGS))

    # Remove by class
    _decompose_outermost(t for t in soup.find_all(True) if has_strip_class(t))

    # Remove by ID
    _decompose_outermost(t for t in soup.find_all(True) if t.get("id") in STRIP_IDS)

    # Drop empty div/span/section/article wrappers. find_all() yields document
    # order (ancestors before descendants), so walking it in reverse handles
    # the deepest wrappers first: one pass is enough, and a tag is never
    # inspected after one of its ancestors has been decomposed.
    # Wrappers holding an image with alt text are kept, because only
    # empty-alt images are treated as noise later in the pipeline.
    for tag in reversed(soup.find_all(_WRAPPER_TAGS)):
        if not _has_visible_content(tag):
            tag.decompose()


def clean_markdown(raw: str) -> str:
    """Post-process the markdownify output into clean, readable Markdown.

    Drops standalone step-number lines and separator-glyph lines, removes
    images without alt text, blanks whitespace-only lines, collapses runs of
    blank lines, and trims the document.
    """
    cleaned = []
    for line in raw.splitlines():
        # Strip standalone step-number lines like "01", "02", "10"
        if _STEP_NUMBER_LINE.fullmatch(line):
            continue

        # Remove bullet separator characters (✦ • · — ★ ✓ etc.)
        if _SEPARATOR_LINE.fullmatch(line):
            continue

        # Remove empty image markdown: ![](...) or ![ ](...)
        line = _EMPTY_IMAGE.sub("", line)

        # Lines that are now entirely whitespace become truly empty, so the
        # blank-line collapse below can see them
        if not line.strip():
            line = ""

        cleaned.append(line)

    result = "\n".join(cleaned)

    # Collapse 3+ consecutive newlines to 2
    result = _EXCESS_BLANK_LINES.sub("\n\n", result)

    # Strip leading/trailing whitespace from the whole document
    return result.strip()


def build_frontmatter(meta: dict) -> str:
    """Render YAML frontmatter block.

    Args:
        meta: Dict with the keys ``title``, ``description`` and ``url``.

    Returns:
        The frontmatter, delimited by ``---`` lines, without a trailing newline.
    """
    def escape(val) -> str:
        val = "" if val is None else str(val)
        # A frontmatter value must stay on one line
        val = re.sub(r"\s*[\r\n]+\s*", " ", val).strip()
        needs_quotes = (
            any(c in val for c in _YAML_QUOTE_TRIGGERS)
            or val.startswith(_YAML_LEADING_INDICATORS)
        )
        if not needs_quotes:
            return val
        # Backslash first, otherwise the quote escapes would be doubled
        escaped = val.replace("\\", "\\\\").replace('"', '\\"')
        return f'"{escaped}"'

    lines = [
        "---",
        f"title: {escape(meta['title'])}",
        f"description: {escape(meta['description'])}",
        f"url: {meta['url']}",
        f"last_updated: {TODAY}",
        "---",
    ]
    return "\n".join(lines)


def html_to_markdown(html_path: Path, site_root: Path, site_root_url: str) -> str:
    """Full pipeline: HTML file → clean Markdown string with frontmatter."""
    raw_html = html_path.read_text(encoding="utf-8", errors="replace")
    soup = BeautifulSoup(raw_html, "html.parser")

    # Metadata is read before strip_noise() mutates the tree
    rel_path = html_path.relative_to(site_root).as_posix()
    meta = extract_meta(soup, site_root_url, rel_path)

    # Work only on <body> content
    body = soup.find("body") or soup
    strip_noise(body)

    # Convert to markdown
    raw_md = md(
        str(body),
        heading_style="ATX",  # use # ## ### headings
        bullets="-",          # normalise bullets to -
    )

    clean_md = clean_markdown(raw_md)
    frontmatter = build_frontmatter(meta)

    return f"{frontmatter}\n\n{clean_md}"


def _detect_site_root_url(site_root: Path) -> str:
    """Return scheme://host from the root page's canonical link, or the default."""
    root_index = site_root / "index.html"
    if not root_index.exists():
        return DEFAULT_SITE_ROOT_URL
    try:
        soup = BeautifulSoup(
            root_index.read_text(encoding="utf-8", errors="replace"), "html.parser"
        )
        parsed = urlparse(_find_canonical_href(soup))
    except Exception as exc:
        # Best effort only: report the problem and keep the default
        print(f"WARNING: could not detect base URL from '{root_index}' ({exc})")
        return DEFAULT_SITE_ROOT_URL
    # A relative canonical href has no scheme/host and must not be adopted
    if parsed.scheme and parsed.netloc:
        return f"{parsed.scheme}://{parsed.netloc}"
    return DEFAULT_SITE_ROOT_URL


# ── Main ─────────────────────────────────────────────────────────────────────

def main():
    """Generate index.md next to every index.html under the site root.

    The site root is the first CLI argument, or the current working directory.
    Exits with status 1 when the site root is not a directory.
    """
    # Accept site root as CLI arg or default to cwd
    site_root = Path(sys.argv[1]).resolve() if len(sys.argv) > 1 else Path.cwd()

    if not site_root.is_dir():
        print(f"ERROR: '{site_root}' is not a directory.")
        sys.exit(1)

    # Try to detect the canonical site URL from the root index.html
    site_root_url = _detect_site_root_url(site_root)

    print(f"\n{'─'*60}")
    print(" Markdown Mirror Generator")
    print(f" Site root : {site_root}")
    print(f" Base URL : {site_root_url}")
    print(f" Date : {TODAY}")
    print(f"{'─'*60}\n")

    # Walk and collect all index.html files
    all_html = sorted(site_root.rglob("index.html"))
    print(f"Found {len(all_html)} index.html file(s). Processing...\n")

    generated = []
    skipped = []

    for html_path in all_html:
        rel = html_path.relative_to(site_root).as_posix()

        skip, reason = should_skip_page(html_path, site_root)
        if skip:
            skipped.append((rel, reason))
            print(f" SKIP {rel} ({reason})")
            continue

        # One bad page must not abort the whole run; it is reported instead
        try:
            content = html_to_markdown(html_path, site_root, site_root_url)
            out_path = html_path.parent / "index.md"
            out_path.write_text(content, encoding="utf-8")
            generated.append(rel)
            word_count = len(content.split())
            print(f" OK {rel} → index.md ({word_count} words)")
        except Exception as exc:
            skipped.append((rel, str(exc)))
            print(f" ERROR {rel} ({exc})")

    # Summary
    print(f"\n{'─'*60}")
    print(f" ✅ Generated : {len(generated)} markdown file(s)")
    print(f" ⏭️ Skipped : {len(skipped)} page(s)")
    if skipped:
        print()
        print(" Skipped pages:")
        for path, reason in skipped:
            print(f" • {path} ({reason})")
    print(f"{'─'*60}\n")


if __name__ == "__main__":
    main()