Function bodies 108 total
_build_toc function · python · L108-L122 (15 LOC)src/scrawl/assemble/markdown.py
def _build_toc(documents: list[DocumentInfo], config) -> str:
    """Render a Markdown table of contents for the given documents.

    Documents are walked in the order returned by ``_order_documents``. Every
    time the document type changes a bold group bullet is emitted; each
    document then gets a bullet linking to the slug of its filename, followed
    by its ISO filing date when one is set.
    """
    toc = ["# Table of Contents\n"]
    previous_type = None
    for entry in _order_documents(documents, config):
        if entry.doc_type != previous_type:
            previous_type = entry.doc_type
            # Fall back to the raw type key when no display label is registered.
            heading = DOC_TYPE_LABELS.get(previous_type, previous_type)
            toc.append(f"- **{heading}**")
        if entry.filing_date:
            suffix = f" ({entry.filing_date.isoformat()})"
        else:
            suffix = ""
        anchor = _slugify(entry.filename)
        toc.append(f" - [{entry.filename}](#{anchor}){suffix}")
    return "\n".join(toc)
# _order_documents function · python · L125-L134 (10 LOC)src/scrawl/assemble/markdown.py
def _order_documents(documents: list[DocumentInfo], config) -> list[DocumentInfo]:
"""Order documents by type group, then chronologically within each group."""
type_order = {t: i for i, t in enumerate(config.assemble.document_type_order)}
def sort_key(doc):
type_rank = type_order.get(doc.doc_type, 999)
date_rank = doc.filing_date or date.min
return (type_rank, date_rank)
return sorted(documents, key=sort_key)_slugify function · python · L137-L142 (6 LOC)src/scrawl/assemble/markdown.py
def _slugify(text: str) -> str:
"""Convert text to URL-safe slug for Markdown anchors."""
slug = text.lower()
slug = re.sub(r"[^\w\s-]", "", slug)
slug = re.sub(r"[\s]+", "-", slug)
return slug.strip("-")cli function · python · L19-L32 (14 LOC)src/scrawl/cli.py
def cli(ctx, config):
    """Scrawl — OCR pipeline for Social Security Disability case files."""
    # The docstring above is left untouched: it is presumably surfaced as the
    # command's help text by the CLI framework (decorators are not visible here).
    from .config import Config

    ctx.ensure_object(dict)
    if config:
        yaml_source = Path(config)
    else:
        # No explicit config: use the default file when present, otherwise
        # fall back to the built-in defaults.
        candidate = Path("config/default.yaml")
        yaml_source = candidate if candidate.exists() else None
    if yaml_source is None:
        ctx.obj["config"] = Config()
    else:
        ctx.obj["config"] = Config.from_yaml(yaml_source)
# process function · python · L43-L56 (14 LOC)src/scrawl/cli.py
def process(ctx, case_dir, output, case_id, skip_anonymize):
    """Process all PDFs in a case directory."""
    # Docstring kept verbatim (presumably shown as CLI help text).
    from .pipeline import process_case

    settings = ctx.obj["config"]
    source_dir = Path(case_dir)
    destination_dir = Path(output)
    written_file = process_case(
        case_dir=source_dir,
        output_dir=destination_dir,
        config=settings,
        case_id=case_id,
        skip_anonymize=skip_anonymize,
    )
    click.echo(f"Done. Output: {written_file}")
# triage function · python · L62-L86 (25 LOC)src/scrawl/cli.py
def triage(ctx, pdf_file):
    """Classify pages in a PDF (diagnostic mode)."""
    # Docstring kept verbatim (presumably shown as CLI help text).
    from .triage.classifier import classify_document

    settings = ctx.obj["config"]
    pdf_path = Path(pdf_file)
    pages = classify_document(pdf_path, settings)

    click.echo(f"File: {pdf_path.name}")
    click.echo(f"Pages: {len(pages)}")
    click.echo()
    for page in pages:
        # Collect the optional per-page flags in a fixed order.
        flag_table = (
            (page.has_tables, "tables"),
            (page.has_glyphless_font, "glyphless"),
        )
        flags = [label for is_set, label in flag_table if is_set]
        flag_suffix = f" [{', '.join(flags)}]" if flags else ""
        # page_num is printed 1-based for readability.
        click.echo(
            f" Page {page.page_num + 1:3d}: {page.page_type.value:20s} "
            f"conf={page.confidence:.2f} "
            f"text={page.text_length:5d} "
            f"img={page.image_coverage:.1%}{flag_suffix}"
        )
# generate_key function · python · L90-L97 (8 LOC)src/scrawl/cli.py
def generate_key():
    """Generate a Fernet encryption key for mapping store."""
    # Docstring kept verbatim (presumably shown as CLI help text).
    from cryptography.fernet import Fernet

    # Print the environment-variable name from the loaded configuration
    # (anonymize.mapping_encryption.key_env_var) instead of a hard-coded one,
    # so the suggested export line agrees with the configured name. When no
    # click context or config object is available, the previous literal is used.
    # NOTE(review): assumes the mapping store reads its key from key_env_var —
    # confirm against the code that loads the key.
    env_var = "SCRAWL_MAP_KEY"
    ctx = click.get_current_context(silent=True)
    state = getattr(ctx, "obj", None)
    loaded_config = state.get("config") if isinstance(state, dict) else None
    if loaded_config is not None:
        env_var = loaded_config.anonymize.mapping_encryption.key_env_var

    key = Fernet.generate_key().decode()
    click.echo(f"Generated key: {key}")
    click.echo("Set as environment variable:")
    click.echo(f" export {env_var}='{key}'")
# Repobility · open methodology · https://repobility.com/research/
serve function · python · L105-L117 (13 LOC)src/scrawl/cli.py
def serve(ctx, host, port, data_dir):
    """Start the Scrawl web interface."""
    # Docstring kept verbatim (presumably shown as CLI help text).
    import uvicorn
    from .web.app import create_app
    from .web.storage import CaseStore

    settings = ctx.obj["config"]
    case_store = CaseStore(Path(data_dir))
    application = create_app(store=case_store, config=settings)
    click.echo(f"Starting Scrawl web interface at http://{host}:{port}")
    uvicorn.run(application, host=host, port=port)
# TriageConfig class · python · L10-L13 (4 LOC)src/scrawl/config.py
class TriageConfig(BaseModel):
    """Settings for page triage (per-page classification)."""

    # Forwarded to classify_page as ``image_coverage_threshold``.
    image_coverage_threshold: float = 0.95
    # Forwarded to classify_page as ``min_text_length``.
    min_text_length: int = 50
    # Forwarded to classify_page as ``detect_glyphless``.
    detect_glyphless_font: bool = True
# BornDigitalConfig class · python · L16-L18 (3 LOC)src/scrawl/config.py
class BornDigitalConfig(BaseModel):
    """Settings for extracting born-digital pages."""

    # Only "pymupdf4llm" is accepted by the Literal type.
    engine: Literal["pymupdf4llm"] = "pymupdf4llm"
    # Presumably mirrors pymupdf4llm's ``page_chunks`` option — confirm at the call site.
    page_chunks: bool = True
# ScannedTypedConfig class · python · L21-L25 (5 LOC)src/scrawl/config.py
class ScannedTypedConfig(BaseModel):
    """Settings for extracting scanned, typed pages."""

    # NOTE(review): the model accepts three engine names, but the
    # extract_scanned_typed function shown in this listing only handles
    # "docling" and raises ValueError for anything else.
    engine: Literal["marker", "docling", "tesseract"] = "docling"
    force_ocr: bool = True
    paginate_output: bool = True
    # Presumably the rasterisation resolution in dots per inch — confirm.
    dpi: int = 300
# HandwrittenConfig class · python · L28-L32 (5 LOC)src/scrawl/config.py
class HandwrittenConfig(BaseModel):
    """Settings for extracting handwritten pages."""

    # Recognition engine; restricted to the names in the Literal.
    engine: Literal["trocr", "florence2", "got-ocr"] = "trocr"
    # Model identifier; presumably a Hugging Face model id — confirm.
    model: str = "microsoft/trocr-base-handwritten"
    # Text-line detector; restricted to the names in the Literal.
    line_detector: Literal["craft", "surya", "doctr"] = "craft"
    # Presumably the minimum recognition confidence to keep a result — confirm.
    min_confidence: float = 0.3
# TableConfig class · python · L35-L38 (4 LOC)src/scrawl/config.py
class TableConfig(BaseModel):
    """Settings for table extraction."""

    # Engine for text-based tables; only "pdfplumber" is accepted.
    text_engine: Literal["pdfplumber"] = "pdfplumber"
    # Engine for image-based tables.
    image_engine: Literal["tatr", "surya", "docling"] = "tatr"
    # Serialisation format for extracted tables.
    output_format: Literal["markdown", "html", "json"] = "markdown"
# ExtractConfig class · python · L41-L45 (5 LOC)src/scrawl/config.py
class ExtractConfig(BaseModel):
    """Groups the extraction settings: one sub-model per page kind, plus tables."""

    born_digital: BornDigitalConfig = BornDigitalConfig()
    scanned_typed: ScannedTypedConfig = ScannedTypedConfig()
    handwritten: HandwrittenConfig = HandwrittenConfig()
    tables: TableConfig = TableConfig()
# PseudonymizationFormat class · python · L48-L58 (11 LOC)src/scrawl/config.py
class PseudonymizationFormat(BaseModel):
    """Replacement-token templates, one per kind of identifier.

    Templates containing ``{:03d}`` take a zero-padded three-digit number;
    presumably a per-entity counter filled in with ``str.format`` — confirm at
    the call site. The remaining templates are fixed redaction markers.
    """

    person: str = "[PERSON-{:03d}]"
    provider: str = "[PROVIDER-{:03d}]"
    facility: str = "[FACILITY-{:03d}]"
    ssn: str = "[SSN-REDACTED]"
    dob: str = "[DOB-REDACTED]"
    address: str = "[ADDRESS-{:03d}]"
    phone: str = "[PHONE-REDACTED]"
    email: str = "[EMAIL-REDACTED]"
    mrn: str = "[MRN-REDACTED]"
    date: str = "[DATE-{:03d}]"
# About: code-quality intelligence by Repobility · https://repobility.com
SelectiveConfig class · python · L61-L73 (13 LOC)src/scrawl/config.py
class SelectiveConfig(BaseModel):
    """Category lists for the "selective" anonymization mode."""

    # Categories to anonymize.
    anonymize_categories: list[str] = [
        "claimant",
        "treating_providers",
        "medical_facilities",
        "personal_identifiers",
    ]
    # Categories to leave untouched.
    preserve_categories: list[str] = [
        "judges",
        "case_law_parties",
        "commissioners",
        "legal_citations",
    ]
# MappingEncryptionConfig class · python · L76-L79 (4 LOC)src/scrawl/config.py
class MappingEncryptionConfig(BaseModel):
    """Settings for encrypting the pseudonym mapping."""

    enabled: bool = True
    # Only "fernet" is accepted.
    method: Literal["fernet"] = "fernet"
    # Name of an environment variable; presumably where the encryption key is
    # read from — confirm against the code that loads the key.
    key_env_var: str = "SS_OCR_MAP_KEY"
# PresidioConfig class · python · L82-L95 (14 LOC)src/scrawl/config.py
class PresidioConfig(BaseModel):
    """Settings for Presidio-based entity detection."""

    # Presumably the minimum detection score for a finding to be used — confirm.
    score_threshold: float = 0.65
    # Entity type names to look for.
    entities: list[str] = [
        "PERSON",
        "PHONE_NUMBER",
        "EMAIL_ADDRESS",
        "US_SSN",
        "DATE_TIME",
        "LOCATION",
        "MEDICAL_LICENSE",
        "US_DRIVER_LICENSE",
        "IP_ADDRESS",
        "URL",
    ]
# AnonymizeConfig class · python · L98-L107 (10 LOC)src/scrawl/config.py
class AnonymizeConfig(BaseModel):
    """Settings for the anonymization stage."""

    # Only "safe_harbor" is accepted.
    method: Literal["safe_harbor"] = "safe_harbor"
    # "selective" or "uniform"; category lists for the former live in ``selective``.
    mode: Literal["selective", "uniform"] = "selective"
    selective: SelectiveConfig = SelectiveConfig()
    # Named-entity-recognition backend.
    ner_backend: Literal["spacy", "transformer", "gliner"] = "spacy"
    spacy_model: str = "en_core_web_lg"
    presidio: PresidioConfig = PresidioConfig()
    # Replacement-token templates.
    pseudonymization: PseudonymizationFormat = PseudonymizationFormat()
    # Presumably a similarity cut-off for treating two names as one entity — confirm.
    fuzzy_match_threshold: float = 0.85
    mapping_encryption: MappingEncryptionConfig = MappingEncryptionConfig()
# AssembleConfig class · python · L110-L126 (17 LOC)src/scrawl/config.py
class AssembleConfig(BaseModel):
    """Settings for assembling the final Markdown output."""

    # Ordering strategy name.
    ordering: Literal["type_then_chrono", "chronological", "manifest"] = "type_then_chrono"
    # Position in this list is the sort rank of a document type.
    document_type_order: list[str] = [
        "alj_decision",
        "hearing_transcript",
        "medical_records",
        "consultative_exam",
        "function_report",
        "district_court_opinion",
        "appellate_opinion",
        "brief",
        "other",
    ]
    include_toc: bool = True
    include_frontmatter: bool = True
    # Presumably pages below this confidence get a warning — confirm.
    confidence_warning_threshold: float = 0.70
    # "---" is a Markdown horizontal rule.
    page_separator: str = "---"
# PipelineConfig class · python · L129-L133 (5 LOC)src/scrawl/config.py
class PipelineConfig(BaseModel):
    """Pipeline-wide execution settings."""

    max_workers: int = 4
    stage_isolation: bool = True
    # Presumably toggles Apple Metal (MPS) acceleration — confirm.
    mps_enabled: bool = True
    # Directory path for temporary files.
    temp_dir: str = "/tmp/scrawl"
# Config class · python · L136-L147 (12 LOC)src/scrawl/config.py
class Config(BaseModel):
    """Top-level configuration: one sub-model per pipeline concern."""

    pipeline: PipelineConfig = PipelineConfig()
    triage: TriageConfig = TriageConfig()
    extract: ExtractConfig = ExtractConfig()
    anonymize: AnonymizeConfig = AnonymizeConfig()
    assemble: AssembleConfig = AssembleConfig()

    @classmethod
    def from_yaml(cls, path: Path) -> "Config":
        """Build a Config from a YAML file.

        Args:
            path: Location of the YAML file.

        Returns:
            A Config populated from the file's top-level mapping. An empty
            file yields the default configuration.

        Raises:
            TypeError: If the YAML document is not a mapping.
        """
        # Read as UTF-8 explicitly so the result does not depend on the
        # platform's locale encoding.
        with open(path, encoding="utf-8") as f:
            data = yaml.safe_load(f)
        # safe_load returns None for an empty document; cls(**None) would
        # fail with an unhelpful TypeError, so fall back to all defaults.
        if data is None:
            data = {}
        if not isinstance(data, dict):
            raise TypeError(
                f"Config file {path} must contain a YAML mapping, "
                f"got {type(data).__name__}"
            )
        return cls(**data)
# from_yaml method · python · L144-L147 (4 LOC)src/scrawl/config.py
def from_yaml(cls, path: Path) -> "Config":
    """Build a Config from a YAML file.

    Args:
        path: Location of the YAML file.

    Returns:
        A Config populated from the file's top-level mapping. An empty file
        yields the default configuration.

    Raises:
        TypeError: If the YAML document is not a mapping.
    """
    # Read as UTF-8 explicitly so the result does not depend on the
    # platform's locale encoding.
    with open(path, encoding="utf-8") as f:
        data = yaml.safe_load(f)
    # safe_load returns None for an empty document; cls(**None) would fail
    # with an unhelpful TypeError, so fall back to all defaults.
    if data is None:
        data = {}
    if not isinstance(data, dict):
        raise TypeError(
            f"Config file {path} must contain a YAML mapping, "
            f"got {type(data).__name__}"
        )
    return cls(**data)
# Methodology: Repobility · https://repobility.com/research/state-of-ai-code-2026/
extract_born_digital function · python · L10-L51 (42 LOC)src/scrawl/extract/born_digital.py
def extract_born_digital(
pdf_path: Path,
page_nums: list[int] | None = None,
) -> list[ExtractionResult]:
"""Extract markdown from born-digital PDF pages.
Uses pymupdf4llm which converts directly to Markdown with:
- Table preservation (as Markdown tables)
- Header detection (## style)
- Bold/italic preservation
- ~0.12s/page processing speed
Args:
pdf_path: Path to the PDF file
page_nums: Specific pages to extract (0-indexed). None = all pages.
Returns:
List of ExtractionResult, one per page.
"""
results = []
chunks = pymupdf4llm.to_markdown(
str(pdf_path),
page_chunks=True,
pages=page_nums,
)
for chunk in chunks:
# pymupdf4llm returns 1-based page numbers; convert to 0-based
page_num = chunk["metadata"]["page"] - 1
markdown_text = chunk["text"]
results.append(
ExtractionResult(
page_num=page_num,
extract_scanned_typed function · python · L8-L17 (10 LOC)src/scrawl/extract/scanned_typed.py
def extract_scanned_typed(
    pdf_path: Path,
    page_nums: list[int],
    engine: str = "docling",
    dpi: int = 300,
) -> list[ExtractionResult]:
    """Extract scanned, typed pages with the selected engine.

    Only ``"docling"` is handled here; any other engine name raises
    ``ValueError``. ``dpi`` is accepted but not used by this function.
    """
    if engine != "docling":
        raise ValueError(f"Unknown scanned_typed engine: {engine}")
    return _extract_with_docling(pdf_path, page_nums)
# _extract_with_docling function · python · L20-L41 (22 LOC)src/scrawl/extract/scanned_typed.py
def _extract_with_docling(pdf_path: Path, page_nums: list[int]) -> list[ExtractionResult]:
    """Convert the PDF with Docling and return one result per requested page.

    Previously every requested page received the Markdown of the *whole*
    document, so an N-page request produced N copies of the full text. When
    the installed Docling exposes the ``page_no`` argument of
    ``export_to_markdown``, each result now holds only its own page. On
    builds without it the old whole-document text is kept and a warning is
    attached to each result.

    Args:
        pdf_path: PDF to convert.
        page_nums: Pages to return results for (assumed 0-based, matching the
            rest of the pipeline — TODO confirm).

    Returns:
        One ExtractionResult per entry of ``page_nums``, in the same order.
    """
    import inspect

    from docling.document_converter import DocumentConverter

    if not page_nums:
        # Nothing requested: skip the expensive conversion entirely.
        return []

    document = DocumentConverter().convert(str(pdf_path)).document

    export = document.export_to_markdown
    per_page_supported = "page_no" in inspect.signature(export).parameters
    whole_document_md = None if per_page_supported else export()

    results = []
    for page_num in page_nums:
        if per_page_supported:
            # Docling numbers pages from 1 — TODO confirm for the pinned version.
            markdown = export(page_no=page_num + 1)
            warnings = []
        else:
            markdown = whole_document_md
            warnings = [
                "docling per-page export unavailable; markdown covers the whole document"
            ]
        results.append(
            ExtractionResult(
                page_num=page_num,
                markdown=markdown,
                confidence=0.85,  # Docling doesn't provide per-page confidence
                engine="docling",
                warnings=warnings,
            )
        )
    return results
# PageType class · python · L8-L12 (5 LOC)src/scrawl/models.py
class PageType(str, Enum):
    """Page categories; members are also ``str`` so they compare equal to their values."""

    BORN_DIGITAL = "born_digital"
    SCANNED_TYPED = "scanned_typed"
    SCANNED_HANDWRITTEN = "scanned_handwritten"
    BLANK = "blank"
# PageClassification class · python · L16-L23 (8 LOC)src/scrawl/models.py
class PageClassification:
    """Triage result for a single PDF page.

    NOTE(review): presumably declared with ``@dataclass``; the decorator is
    not part of this listing.
    """

    # Page index; the CLI prints it as ``page_num + 1``, so it is 0-based.
    page_num: int
    page_type: PageType
    has_tables: bool
    # Fraction of the page covered by images (the CLI formats it as a percentage).
    image_coverage: float
    # Length of the page's extracted text.
    text_length: int
    has_glyphless_font: bool
    confidence: float
# ExtractionResult class · python · L27-L32 (6 LOC)src/scrawl/models.py
class ExtractionResult:
    """Extraction output for a single page.

    NOTE(review): the ``field(...)`` default implies this is a dataclass; the
    decorator is not part of this listing.
    """

    page_num: int
    # Extracted text in Markdown form.
    markdown: str
    confidence: float
    # Name of the engine that produced the result (e.g. "docling").
    engine: str
    # Fresh list per instance via default_factory.
    warnings: list[str] = field(default_factory=list)
# DocumentResult class · python · L36-L39 (4 LOC)src/scrawl/models.py
class DocumentResult:
    """Extraction results for one source file.

    NOTE(review): the ``field(...)`` defaults imply this is a dataclass; the
    decorator is not part of this listing.
    """

    source_path: Path
    # Per-page results; fresh list per instance.
    pages: list[ExtractionResult] = field(default_factory=list)
    # Free-form metadata; fresh dict per instance.
    metadata: dict = field(default_factory=dict)
# process_case function · python · L17-L140 (124 LOC)src/scrawl/pipeline.py
def process_case(
case_dir: Path,
output_dir: Path,
config: Config,
case_id: str | None = None,
skip_anonymize: bool = False,
progress_callback: Callable[[int, str, int, int], None] | None = None,
) -> Path:
"""Run the full pipeline on a case directory.
Args:
case_dir: Directory containing PDF files
output_dir: Where to write the output Markdown
config: Pipeline configuration
case_id: Override case ID (defaults to directory name)
skip_anonymize: Skip anonymization stage (for testing)
progress_callback: Optional callback(stage, stage_name, current, total)
called to report per-item progress within each pipeline stage.
Returns:
Path to the output Markdown file.
"""
case_dir = Path(case_dir)
output_dir = Path(output_dir)
output_dir.mkdir(parents=True, exist_ok=True)
if case_id is None:
case_id = case_dir.name
pdf_files = sorted(case_dir.glob("*.pdf"))
Citation: Repobility (2026). State of AI-Generated Code. https://repobility.com/research/
_guess_document_type function · python · L143-L165 (23 LOC)src/scrawl/pipeline.py
def _guess_document_type(filename: str) -> str:
"""Guess document type from filename.
Simple keyword matching — can be overridden by a manifest file.
"""
lower = filename.lower()
if "opinion" in lower or "memorandum" in lower:
return "district_court_opinion"
if "decision" in lower or "alj" in lower:
return "alj_decision"
if "transcript" in lower or "hearing" in lower:
return "hearing_transcript"
if "medical" in lower or "records" in lower:
return "medical_records"
if "consultative" in lower or "ce " in lower:
return "consultative_exam"
if "function" in lower or "report" in lower:
return "function_report"
if "brief" in lower:
return "brief"
if "appeal" in lower or "appellate" in lower:
return "appellate_opinion"
return "other"_extract_date_from_filename function · python · L168-L189 (22 LOC)src/scrawl/pipeline.py
def _extract_date_from_filename(filename: str):
"""Extract date from filename like '20240830 Memorandum Opinion.pdf'."""
import re
from datetime import date
# Try YYYYMMDD pattern at start
match = re.match(r"(\d{4})(\d{2})(\d{2})", filename)
if match:
try:
return date(int(match.group(1)), int(match.group(2)), int(match.group(3)))
except ValueError:
pass
# Try YYYY-MM-DD pattern
match = re.search(r"(\d{4})-(\d{2})-(\d{2})", filename)
if match:
try:
return date(int(match.group(1)), int(match.group(2)), int(match.group(3)))
except ValueError:
pass
return Noneclassify_page function · python · L11-L71 (61 LOC)src/scrawl/triage/classifier.py
def classify_page(
page: pymupdf.Page,
image_coverage_threshold: float = 0.95,
min_text_length: int = 50,
detect_glyphless: bool = True,
) -> PageClassification:
"""Classify a single PDF page.
Decision tree:
1. Extract text and compute image coverage ratio
2. If text is substantial and no GlyphlessFont -> born_digital
3. If image covers >=95% of page -> scanned (typed by default)
4. If little text and little image -> blank
"""
text = page.get_text("text").strip()
text_length = len(text)
# Compute image coverage
blocks = page.get_text("dict")["blocks"]
image_area = sum(pymupdf.Rect(b["bbox"]).get_area() for b in blocks if b["type"] == 1)
page_area = page.rect.get_area()
image_coverage = image_area / page_area if page_area > 0 else 0.0
# Detect GlyphlessFont (pre-OCR'd scan with invisible text layer)
has_glyphless = False
if detect_glyphless:
fonts = page.get_fonts()
has_glyphless = anyclassify_document function · python · L74-L89 (16 LOC)src/scrawl/triage/classifier.py
def classify_document(pdf_path: Path, config: Config) -> list[PageClassification]:
    """Classify all pages in a PDF document.

    Args:
        pdf_path: PDF file to open.
        config: Configuration; the ``triage`` section supplies the thresholds
            forwarded to ``classify_page``.

    Returns:
        One PageClassification per page, in document order.
    """
    triage_cfg = config.triage
    doc = pymupdf.open(str(pdf_path))
    # Close the document even when classifying a page raises; previously the
    # handle leaked on any error.
    try:
        return [
            classify_page(
                page,
                image_coverage_threshold=triage_cfg.image_coverage_threshold,
                min_text_length=triage_cfg.min_text_length,
                detect_glyphless=triage_cfg.detect_glyphless_font,
            )
            for page in doc
        ]
    finally:
        doc.close()
# create_app function · python · L18-L188 (171 LOC)src/scrawl/web/app.py
def create_app(store: CaseStore | None = None, config=None) -> FastAPI:
if store is None:
store = CaseStore(Path("data/cases"))
if config is None:
from scrawl.config import Config
config = Config()
app = FastAPI(title="Scrawl")
templates = Jinja2Templates(directory=str(TEMPLATES_DIR))
def highlight_redactions(text):
"""Replace [TYPE-NNN] tokens with colored spans."""
def _replace(match):
token = match.group(0)
if "PERSON" in token or "PROVIDER" in token:
css = "redaction-person"
elif "DATE" in token or "DOB" in token:
css = "redaction-date"
elif "ADDRESS" in token or "LOCATION" in token:
css = "redaction-address"
else:
css = "redaction-other"
return f'<span class="{css} px-1 py-0.5 rounded text-xs font-medium">{token}</span>'
return re.sub(r"\[[A-Z]+-(?:\d{3}|REDACTED)\]", _replacPipelineEvent class · python · L26-L32 (7 LOC)src/scrawl/web/runner.py
class PipelineEvent:
    """A progress/status notification for one pipeline stage.

    NOTE(review): presumably declared with ``@dataclass``; the decorator is
    not part of this listing.
    """

    # Stage number and its display name.
    stage: int
    stage_name: str
    status: str  # "running", "completed", "failed"
    # Error text; None unless one is supplied.
    error: str | None = None
    # Item progress within the stage: current count and total.
    progress: int = 0
    total: int = 0
# PipelineRunner class · python · L35-L159 (125 LOC)src/scrawl/web/runner.py
class PipelineRunner:
def __init__(self, store: CaseStore, config: Config):
self.store = store
self.config = config
self._running: dict[str, bool] = {}
self._events: dict[str, list[PipelineEvent]] = defaultdict(list)
self._subscribers: dict[str, list[asyncio.Queue]] = defaultdict(list)
self._lock = threading.Lock()
def is_running(self, case_id: str) -> bool:
with self._lock:
return self._running.get(case_id, False)
def start(self, case_id: str):
with self._lock:
if self._running.get(case_id, False):
return
self._running[case_id] = True
self._events[case_id] = []
thread = threading.Thread(
target=self._run_pipeline,
args=(case_id,),
daemon=True,
)
thread.start()
def get_events(self, case_id: str) -> list[PipelineEvent]:
with self._lock:
return list(self._events.g__init__ method · python · L36-L42 (7 LOC)src/scrawl/web/runner.py
def __init__(self, store: CaseStore, config: Config):
    """Keep the store and config, and create empty per-case bookkeeping."""
    self.store = store
    self.config = config
    # case_id -> True while a pipeline run is marked active.
    self._running: dict[str, bool] = {}
    # case_id -> events published so far (replayed to new subscribers).
    self._events: dict[str, list[PipelineEvent]] = defaultdict(list)
    # case_id -> queues of the current subscribers.
    self._subscribers: dict[str, list[asyncio.Queue]] = defaultdict(list)
    # Guards the three dictionaries above.
    self._lock = threading.Lock()
# Repobility · open methodology · https://repobility.com/research/
is_running method · python · L44-L46 (3 LOC)src/scrawl/web/runner.py
def is_running(self, case_id: str) -> bool:
    """Report whether a run is currently marked active for ``case_id``."""
    with self._lock:
        active = self._running.get(case_id, False)
    return active
# start method · python · L48-L60 (13 LOC)src/scrawl/web/runner.py
def start(self, case_id: str):
    """Launch the pipeline for ``case_id`` on a daemon thread.

    Does nothing when a run is already marked active for the case. Otherwise
    the case is marked running, its recorded events are reset, and
    ``_run_pipeline`` is started in the background.
    """
    with self._lock:
        already_active = self._running.get(case_id, False)
        if not already_active:
            self._running[case_id] = True
            self._events[case_id] = []
    if already_active:
        return
    worker = threading.Thread(
        target=self._run_pipeline,
        args=(case_id,),
        daemon=True,
    )
    worker.start()
# get_events method · python · L62-L64 (3 LOC)src/scrawl/web/runner.py
def get_events(self, case_id: str) -> list[PipelineEvent]:
    """Return a copy of the events recorded so far for ``case_id``."""
    with self._lock:
        recorded = self._events.get(case_id, [])
        return [*recorded]
# subscribe method · python · L66-L72 (7 LOC)src/scrawl/web/runner.py
def subscribe(self, case_id: str) -> asyncio.Queue:
    """Register a new listener queue for ``case_id`` and return it.

    Events already recorded for the case are placed on the queue first, so a
    late subscriber sees the full history.
    """
    inbox: asyncio.Queue = asyncio.Queue()
    with self._lock:
        self._subscribers[case_id].append(inbox)
        backlog = self._events.get(case_id, [])
        for past_event in backlog:
            inbox.put_nowait(past_event)
    return inbox
# unsubscribe method · python · L74-L78 (5 LOC)src/scrawl/web/runner.py
def unsubscribe(self, case_id: str, queue: asyncio.Queue):
    """Remove ``queue`` from the listeners of ``case_id`` if it is registered."""
    with self._lock:
        listeners = self._subscribers.get(case_id, [])
        try:
            listeners.remove(queue)
        except ValueError:
            # Not registered: nothing to do.
            pass
# _publish method · python · L80-L87 (8 LOC)src/scrawl/web/runner.py
def _publish(self, case_id: str, event: PipelineEvent):
with self._lock:
self._events[case_id].append(event)
for queue in self._subscribers.get(case_id, []):
try:
queue.put_nowait(event)
except asyncio.QueueFull:
pass_run_pipeline method · python · L89-L159 (71 LOC)src/scrawl/web/runner.py
def _run_pipeline(self, case_id: str):
try:
input_dir = self.store.input_dir(case_id)
output_dir = input_dir.parent
last_stage = [0] # track last stage for store updates
def on_progress(stage: int, stage_name: str, current: int, total: int):
if stage != last_stage[0]:
last_stage[0] = stage
self.store.update_status(
case_id, "processing", pipeline_stage=stage
)
self._publish(
case_id,
PipelineEvent(
stage=stage,
stage_name=stage_name,
status="running",
progress=current,
total=total,
),
)
output_file = process_case(
case_dir=input_dir,
output_dir=output_dir,
cCaseMeta class · python · L19-L25 (7 LOC)src/scrawl/web/storage.py
class CaseMeta:
    """Metadata record for one case.

    NOTE(review): the ``field(...)`` default implies this is a dataclass; the
    decorator is not part of this listing.
    """

    # Case identifier; create_case assigns 12 hex characters from a UUID4.
    id: str
    name: str
    # Creation time as an ISO-8601 string (create_case uses the current UTC time).
    created_at: str
    status: str = "created"
    pipeline_stage: int = 0
    # Free-form statistics; fresh dict per instance.
    stats: dict = field(default_factory=dict)
# About: code-quality intelligence by Repobility · https://repobility.com
CaseStore class · python · L28-L112 (85 LOC)src/scrawl/web/storage.py
class CaseStore:
def __init__(self, base_dir: Path):
self.base_dir = Path(base_dir)
self.base_dir.mkdir(parents=True, exist_ok=True)
def create_case(self, name: str) -> CaseMeta:
case_id = uuid.uuid4().hex[:12]
case_dir = self.base_dir / case_id
case_dir.mkdir(parents=True)
(case_dir / "input").mkdir()
meta = CaseMeta(
id=case_id,
name=name,
created_at=datetime.now(timezone.utc).isoformat(),
)
self._write_meta(case_id, meta)
return meta
def list_cases(self) -> list[CaseMeta]:
cases = []
if not self.base_dir.exists():
return cases
for case_dir in sorted(self.base_dir.iterdir()):
meta_file = case_dir / "meta.json"
if meta_file.exists():
cases.append(self._read_meta(meta_file))
return cases
def get_case(self, case_id: str) -> CaseMeta | None:
meta_file = self.base___init__ method · python · L29-L31 (3 LOC)src/scrawl/web/storage.py
def __init__(self, base_dir: Path):
    """Remember the storage root and create it on disk if it does not exist."""
    self.base_dir = Path(base_dir)
    # parents/exist_ok: safe to call repeatedly and with nested paths.
    self.base_dir.mkdir(parents=True, exist_ok=True)
# create_case method · python · L33-L45 (13 LOC)src/scrawl/web/storage.py
def create_case(self, name: str) -> CaseMeta:
    """Create a new case directory with an ``input`` folder and persist its metadata.

    The case id is the first 12 hex characters of a random UUID4, and the
    creation time is the current UTC time in ISO format. Returns the
    metadata record that was written.
    """
    new_id = uuid.uuid4().hex[:12]
    case_root = self.base_dir / new_id
    case_root.mkdir(parents=True)
    case_root.joinpath("input").mkdir()

    timestamp = datetime.now(timezone.utc).isoformat()
    record = CaseMeta(id=new_id, name=name, created_at=timestamp)
    self._write_meta(new_id, record)
    return record