#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Export Confluence pages and blog posts to JSON files, plus one PDF per page.

Configuration is read from environment variables:

* ``CONFLUENCE_URL``       -- base URL of the instance (trailing "/" removed).
* ``CONFLUENCE_API_TOKEN`` -- bearer token; the module refuses to load without it.
* ``EXPORT_BASE_DIR``      -- output root; ``pages/`` and ``pdfs/`` are created below it.
* ``SPACE_KEY_FILTER``     -- optional space-key prefix; empty means every space.
"""
import os
import json
import unicodedata
import re
import logging
import shutil
from typing import Any, Dict, Iterator, List, Optional
from datetime import datetime
from urllib.parse import urljoin
from time import sleep
from pathlib import Path

import requests

# ==== CONFIGURATION ====
CONFLUENCE_URL = os.getenv("CONFLUENCE_URL", "https://confluence.local").rstrip("/")
BEARER_TOKEN = os.getenv("CONFLUENCE_API_TOKEN")
if not BEARER_TOKEN:
    raise RuntimeError("Il manque l'env var CONFLUENCE_API_TOKEN")

BASE_DIR = Path(os.getenv("EXPORT_BASE_DIR", "/repo/confluence"))
EXPORT_DIR_PAGES = BASE_DIR / "pages"
EXPORT_DIR_PDFS = BASE_DIR / "pdfs"
SPACE_KEY_FILTER = os.getenv("SPACE_KEY_FILTER", "")  # "" = all spaces, otherwise a space-key prefix

# Number of attempts for the (potentially slow) PDF download.
PDF_DOWNLOAD_ATTEMPTS = 3
# Chunk size used when streaming a PDF to disk.
PDF_CHUNK_SIZE = 64 * 1024

# ==== LOGGER ====
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")


# ==== SLUGIFY UTILITY ====
def _slugify(s: str) -> str:
    """Return a lowercase ASCII slug of *s* made of ``[a-z0-9_]``.

    The text is NFKD-normalised and non-ASCII characters are dropped, then
    every run of other characters becomes a single ``_`` and leading/trailing
    underscores are stripped. If nothing is left, the ASCII-folded input is
    returned unchanged.
    """
    s = unicodedata.normalize("NFKD", s).encode("ascii", "ignore").decode("ascii")
    return re.sub(r"[^a-z0-9]+", "_", s.lower()).strip("_") or s


os.makedirs(EXPORT_DIR_PAGES, exist_ok=True)
os.makedirs(EXPORT_DIR_PDFS, exist_ok=True)

# ==== REST SESSION (Bearer) ====
sess = requests.Session()
sess.headers.update({
    "Authorization": f"Bearer {BEARER_TOKEN}",
    "Content-Type": "application/json",
})

# ==== EXPORT PAGES TO JSON via REST ====
TYPES = ["page", "blogpost"]


def _absolute_api_url(link: str) -> str:
    """Turn a ``_links`` value from a REST response into an absolute URL.

    Relative links are appended to ``CONFLUENCE_URL`` rather than resolved
    with ``urljoin``: a link starting with "/" would otherwise replace the
    whole path of ``CONFLUENCE_URL`` and lose a context path such as
    ``/confluence``.
    """
    if link.startswith(("http://", "https://")):
        return link
    return f"{CONFLUENCE_URL}/{link.lstrip('/')}"


def _iter_results(url: str, params: Optional[Dict[str, Any]]) -> Iterator[Dict[str, Any]]:
    """Yield every item of a paginated REST collection.

    *params* is sent with the first request only; follow-up requests use the
    ``_links.next`` URL returned by the server as-is.

    Raises:
        requests.HTTPError: when the server answers with an error status.
    """
    next_url: Optional[str] = url
    query = params
    while next_url:
        resp = sess.get(next_url, params=query, timeout=60)
        resp.raise_for_status()
        data = resp.json()
        yield from data["results"]
        nxt = data.get("_links", {}).get("next")
        next_url = _absolute_api_url(nxt) if nxt else None
        query = None


def _pdf_is_current(pdf_dst: Path, ts: float) -> bool:
    """Tell whether *pdf_dst* exists with an mtime within one second of *ts*."""
    return pdf_dst.exists() and abs(pdf_dst.stat().st_mtime - ts) < 1


def _download_pdf(pid: str, pdf_dst: Path, ts: float) -> bool:
    """Export page *pid* as PDF to *pdf_dst* and stamp it with mtime *ts*.

    Returns:
        True when a PDF was written, False when the export action did not
        answer with a redirect to a PDF (a warning is logged in that case).

    Raises:
        requests.RequestException: on HTTP errors, or when the last download
            attempt fails with a timeout / connection error.
    """
    g = sess.get(
        f"{CONFLUENCE_URL}/spaces/flyingpdf/pdfpageexport.action",
        params={"pageId": pid},
        headers={"X-Atlassian-Token": "no-check"},
        allow_redirects=False,
        timeout=(10, 30),
    )
    if g.status_code != 302 or "Location" not in g.headers:
        logging.warning("Pas de PDF pour page %s (HTTP %s)", pid, g.status_code)
        return False

    # A Location header is a reference relative to the request URL.
    pdf_url = urljoin(g.url, g.headers["Location"])

    # Download next to the target and move into place only once complete, so a
    # failed transfer never leaves a truncated PDF under the final name.
    tmp = pdf_dst.with_name(pdf_dst.name + ".part")
    retryable = (
        requests.ReadTimeout,
        requests.ConnectionError,
        requests.exceptions.ChunkedEncodingError,
    )
    try:
        for attempt in range(PDF_DOWNLOAD_ATTEMPTS):
            try:
                with sess.get(pdf_url, stream=True, timeout=(10, 180)) as resp:
                    resp.raise_for_status()
                    with tmp.open("wb") as f:
                        # iter_content decodes any Content-Encoding and reports
                        # mid-stream failures as requests exceptions.
                        for chunk in resp.iter_content(chunk_size=PDF_CHUNK_SIZE):
                            f.write(chunk)
                os.replace(tmp, pdf_dst)
                os.utime(pdf_dst, (ts, ts))
                return True
            except retryable:
                if attempt == PDF_DOWNLOAD_ATTEMPTS - 1:
                    raise
                sleep(5 * (attempt + 1))  # linear back-off: 5 s, 10 s, ...
    finally:
        try:
            tmp.unlink()
        except FileNotFoundError:
            pass
    return False


# ─── EXPORT PAGES + PDF (single loop) ────────────────────────────────────
def export_pages_and_pdfs() -> None:
    """Write one JSON file per page/blog post and refresh outdated page PDFs.

    Spaces are listed through the REST API and optionally filtered by
    ``SPACE_KEY_FILTER``. For each content item a JSON document is always
    written to ``EXPORT_DIR_PAGES``. For items of type ``page`` a PDF is
    downloaded to ``EXPORT_DIR_PDFS`` unless a file with a matching mtime is
    already there. PDF failures are logged and do not stop the export.

    Raises:
        requests.RequestException: when a REST listing request fails.
    """
    total_json, total_pdf = 0, 0

    # The server may return fewer spaces than the requested limit, so follow
    # the "next" links instead of trusting a single response.
    spaces: List[Dict[str, Any]] = list(
        _iter_results(
            f"{CONFLUENCE_URL}/rest/api/space",
            {"limit": 10000, "status": "current"},
        )
    )

    for sp in spaces:
        key = sp["key"]
        if SPACE_KEY_FILTER and not key.startswith(SPACE_KEY_FILTER):
            continue

        cql = f'space = "{key}" AND type IN ({",".join(TYPES)})'
        params = {"cql": cql, "limit": 500, "expand": "body.storage,version"}

        for p in _iter_results(f"{CONFLUENCE_URL}/rest/api/content/search", params):
            pid, title, ctype = p["id"], p["title"], p["type"]
            when = p["version"]["when"]
            dt = datetime.fromisoformat(when.replace("Z", "+00:00"))
            ts = dt.timestamp()
            slug = _slugify(f"{key}_{title}")
            pdf_fn = f"{dt:%Y%m%d}_{slug}_{pid}.pdf"
            pdf_dst = EXPORT_DIR_PDFS / pdf_fn

            # --- optional PDF download (pages only)
            if ctype == "page" and not _pdf_is_current(pdf_dst, ts):
                try:
                    if _download_pdf(pid, pdf_dst, ts):
                        logging.info("PDF %s téléchargé → %s", pid, pdf_fn)
                        total_pdf += 1
                except Exception as e:
                    # Best effort: a missing PDF must not prevent the JSON export.
                    logging.error("❌ PDF page %s ignoré : %s", pid, e)

            # --- JSON output (always)
            meta = {
                "id": pid,
                "space": key,
                "title": title,
                "url": f"{CONFLUENCE_URL}/pages/viewpage.action?pageId={pid}",
                "last_modified": when,
                "html": p["body"]["storage"]["value"],
                "content_type": ctype,
                "pdf_filename": pdf_fn if (ctype == "page" and pdf_dst.exists()) else None,
            }
            json_path = EXPORT_DIR_PAGES / f"{slug}_{pid}.json"
            json_path.write_text(
                json.dumps(meta, ensure_ascii=False, indent=2), encoding="utf-8"
            )
            total_json += 1

    logging.info("✅ %d fichiers JSON écrits – %d PDF mis à jour", total_json, total_pdf)


if __name__ == "__main__":
    try:
        export_pages_and_pdfs()
        logging.info("Export Confluence terminé.")
    except Exception as e:
        logging.error(f"Erreur lors de l'export : {e}", exc_info=True)
        # Signal the failure to the caller (cron, CI, ...) instead of exiting 0.
        raise SystemExit(1)