← back to kyuwon-shim-ARL__papersift

Function bodies 69 total

All specs Real LLM only Function bodies
run_enrich function · python · L221-L241 (21 LOC)
src/papersift/cli.py
def run_enrich(args):
    """Run the ``enrich`` subcommand.

    Loads the papers named by ``args.input``, asks ``OpenAlexEnricher`` for
    the comma-separated field names in ``args.fields`` (using ``args.email``
    as the contact address), and writes the result as indented JSON to
    ``args.output``. Missing parent directories of the output are created.

    Exits with status 1 when the enrichment module cannot be imported.
    """
    try:
        from papersift.enrich import OpenAlexEnricher
    except ImportError:
        print(
            "Error: enrichment requires pyalex. Install with: pip install papersift[enrich]",
            file=sys.stderr,
        )
        sys.exit(1)

    source_papers = load_papers(args.input)
    requested = [name.strip() for name in args.fields.split(',')]

    result = OpenAlexEnricher(email=args.email).enrich_papers(
        source_papers, fields=requested
    )

    destination = Path(args.output)
    destination.parent.mkdir(parents=True, exist_ok=True)
    with open(destination, 'w') as handle:
        json.dump(result, handle, indent=2)

    print(f"Saved: {destination}")
run_find function · python · L244-L292 (49 LOC)
src/papersift/cli.py
def run_find(args):
    """Execute find command."""
    from papersift import EntityLayerBuilder

    papers = load_papers(args.input)
    use_topics = getattr(args, 'use_topics', False)
    builder = EntityLayerBuilder(use_topics=use_topics)
    builder.build_from_papers(papers)

    if args.hubs:
        hubs = builder.find_hub_papers(top_k=args.hubs)
        if args.format == "json":
            # JSON output
            output = []
            for h in hubs:
                title = get_title(papers, h['doi'])
                output.append({
                    'doi': h['doi'],
                    'title': title,
                    'hub_score': h['hub_score'],
                    'entities': h['entities']
                })
            print(json.dumps(output, indent=2))
        else:
            # Table output (default)
            print(f"Top {args.hubs} Entity Hub Papers:")
            print("-" * 60)
            for i, h in enumerate(hubs, 1):
                title = get_title(
run_stream function · python · L295-L334 (40 LOC)
src/papersift/cli.py
def run_stream(args):
    """Execute stream command."""
    from papersift import EntityLayerBuilder

    papers = load_papers(args.input)
    use_topics = getattr(args, 'use_topics', False)
    builder = EntityLayerBuilder(use_topics=use_topics)
    builder.build_from_papers(papers)

    if args.expand:
        reachable = builder.expand_from_seed(args.seed, hops=args.hops)
        if args.format == "json":
            # JSON output
            output = []
            for doi in reachable:
                title = get_title(papers, doi)
                output.append({'doi': doi, 'title': title})
            print(json.dumps(output, indent=2))
        else:
            # Table output (default)
            print(f"Papers reachable in {args.hops} hops from seed: {len(reachable)}")
            for doi in list(reachable)[:20]:
                title = get_title(papers, doi)
                print(f"  - {title[:60]}...")
    else:
        path = builder.entity_stream(args.seed, strategy=args.s
load_papers function · python · L337-L358 (22 LOC)
src/papersift/cli.py
def load_papers(path):
    """Load papers from file or stdin.

    The JSON document may be a bare list of paper dicts, or an object that
    wraps the list under a ``"papers"`` key (the wrapped value is returned).
    An object without a ``"papers"`` key is returned unchanged.

    Args:
        path: File path or "-" for stdin

    Returns:
        List of paper dicts

    Exits with status 1 when ``path`` is "-" and stdin is a terminal or does
    not contain valid JSON. Errors while reading a regular file propagate to
    the caller.
    """
    if path == "-":
        if sys.stdin.isatty():
            print("Error: No input on stdin. Use '-' only when piping data.", file=sys.stderr)
            sys.exit(1)
        try:
            data = json.load(sys.stdin)
        except json.JSONDecodeError:
            print("Error: Invalid JSON input on stdin", file=sys.stderr)
            sys.exit(1)
    else:
        # Decode as UTF-8 explicitly so the result does not depend on the
        # platform's locale encoding (e.g. cp1252 on Windows).
        with open(path, encoding="utf-8") as f:
            data = json.load(f)

    if isinstance(data, dict):
        return data.get('papers', data)
    return data
run_cluster function · python · L365-L436 (72 LOC)
src/papersift/cli.py
def run_cluster(args):
    """Execute clustering command."""
    from papersift import EntityLayerBuilder, ClusterValidator

    # Load papers
    papers = load_papers(args.input)
    print(f"Loaded {len(papers)} papers")

    # Build entity graph and cluster
    use_topics = getattr(args, 'use_topics', False)
    mode = "Title + OpenAlex Topics" if use_topics else "Title-only"
    print(f"Building entity graph ({mode})...")
    builder = EntityLayerBuilder(use_topics=use_topics)
    builder.build_from_papers(papers)
    print(f"  Graph: {builder.graph.vcount()} nodes, {builder.graph.ecount()} edges")

    print(f"Running Leiden clustering (resolution={args.resolution}, seed={args.seed})...")
    clusters = builder.run_leiden(resolution=args.resolution, seed=args.seed)
    num_clusters = len(set(clusters.values()))
    print(f"  Found {num_clusters} clusters")

    # Generate summaries
    summaries = builder.get_cluster_summary(clusters)

    # Create output directory
    output_dir =
run_ui function · python · L439-L461 (23 LOC)
src/papersift/cli.py
def run_ui(args):
    """Start the interactive UI server, or write a static HTML export.

    When ``args.export`` is set, the network is exported to that path and no
    server is started; combining it with a non-default ``args.host`` is
    rejected with exit status 1. Otherwise the server is launched, exiting
    with status 1 if the UI module cannot be imported.
    """
    if args.export:
        # A static export never listens anywhere, so a custom host makes no sense.
        if args.host != "127.0.0.1":
            print("Error: --export and --host cannot be used together.", file=sys.stderr)
            print("  --export generates a static file, no server is started.", file=sys.stderr)
            sys.exit(1)

        from papersift.ui.exporter import export_network_html
        export_mode = getattr(args, 'mode', 'cluster')
        export_network_html(args.input, args.export, mode=export_mode)
        return

    try:
        from papersift.ui.app import run_server
    except ImportError:
        print(
            "Error: UI requires additional dependencies. Install with: pip install -r requirements-ui.txt",
            file=sys.stderr,
        )
        sys.exit(1)

    topics_enabled = getattr(args, 'use_topics', False)
    run_server(
        args.input,
        port=args.port,
        debug=args.debug,
        host=args.host,
        use_topics=topics_enabled,
    )
run_browse function · python · L464-L534 (71 LOC)
src/papersift/cli.py
def run_browse(args):
    """Browse cluster contents in text mode."""
    from papersift import EntityLayerBuilder

    papers = load_papers(args.input)
    use_topics = getattr(args, 'use_topics', False)
    builder = EntityLayerBuilder(use_topics=use_topics)
    builder.build_from_papers(papers)

    clusters = builder.run_leiden(resolution=args.resolution, seed=42)
    summaries = builder.get_cluster_summary(clusters)
    # Sort by size descending
    summaries.sort(key=lambda s: s['size'], reverse=True)

    # Default to --list if no specific action
    if not args.cluster and not args.export:
        args.list = True

    if args.list:
        _browse_list(summaries, len(papers), args.format)

    if args.cluster:
        cluster_ids = [int(x.strip()) for x in args.cluster.split(',')]
        _browse_detail(summaries, cluster_ids, papers, args.full, args.format)

        if args.export:
            _browse_export(summaries, cluster_ids, papers, args.export)

    if getattr(args, '
Want fix-PRs on findings? Install Repobility's GitHub App · github.com/apps/repobility-bot
_browse_list function · python · L537-L554 (18 LOC)
src/papersift/cli.py
def _browse_list(summaries, total_papers, format_type="table"):
    """Print a one-line overview of every cluster.

    With ``format_type == "json"`` a JSON array holding each cluster's id,
    size and top entities is printed. Any other value prints a text listing
    headed by the cluster and paper counts, showing at most five entities
    per cluster.
    """
    if format_type != "json":
        # Table output (default)
        print(f"{len(summaries)} clusters found ({total_papers} papers total)\n")
        for summary in summaries:
            preview = ', '.join(summary['top_entities'][:5])
            print(f"Cluster {summary['cluster_id']} ({summary['size']} papers): {preview}")
        return

    # JSON output: keep only the overview fields, in a fixed key order
    overview_keys = ('cluster_id', 'size', 'top_entities')
    payload = [{key: summary[key] for key in overview_keys} for summary in summaries]
    print(json.dumps(payload, indent=2))
_browse_detail function · python · L557-L608 (52 LOC)
src/papersift/cli.py
def _browse_detail(summaries, cluster_ids, papers, full=False, format_type="table"):
    """Print detailed cluster info."""
    summary_map = {s['cluster_id']: s for s in summaries}

    if format_type == "json":
        # JSON output
        output = []
        for cid in cluster_ids:
            if cid not in summary_map:
                continue
            s = summary_map[cid]
            # Get sample papers with details
            sample_papers = []
            for doi in s['dois'][:3]:
                title = get_title(papers, doi)
                year = next((p.get('year', '?') for p in papers if p['doi'] == doi), '?')
                sample_papers.append({'doi': doi, 'title': title, 'year': year})

            output.append({
                'cluster_id': cid,
                'size': s['size'],
                'top_entities': s['top_entities'],
                'sample_papers': sample_papers,
                'dois': s['dois'] if full else s['dois'][:10]
            })
        p
_browse_export function · python · L611-L625 (15 LOC)
src/papersift/cli.py
def _browse_export(summaries, cluster_ids, papers, output_path):
    """Export selected clusters to JSON.

    Collects the DOIs of every cluster in ``cluster_ids`` that appears in
    ``summaries`` and writes the matching entries of ``papers`` (in their
    original order) to ``output_path`` as indented JSON. Missing parent
    directories of ``output_path`` are created, matching the other commands
    in this file that write output files.

    Cluster ids not present in ``summaries`` contribute no papers, but are
    still included in the "Selected N clusters" count that is printed.
    """
    summary_map = {s['cluster_id']: s for s in summaries}
    selected_dois = set()
    for cid in cluster_ids:
        if cid in summary_map:
            selected_dois.update(summary_map[cid]['dois'])

    selected_papers = [p for p in papers if p['doi'] in selected_dois]

    # Create the target directory first so a nested export path does not fail.
    destination = Path(output_path)
    destination.parent.mkdir(parents=True, exist_ok=True)
    with open(destination, 'w') as f:
        json.dump(selected_papers, f, indent=2)

    print(f"Selected {len(cluster_ids)} clusters ({len(selected_papers)} papers total)")
    print(f"Exported to: {output_path}")
run_landscape function · python · L628-L726 (99 LOC)
src/papersift/cli.py
def run_landscape(args):
    """Generate landscape visualization as HTML."""
    from papersift.embedding import embed_papers
    from papersift import EntityLayerBuilder

    papers = load_papers(args.input)
    use_topics = getattr(args, 'use_topics', False)
    print(f"Loaded {len(papers)} papers", file=sys.stderr)

    # Cluster for coloring
    builder = EntityLayerBuilder(use_topics=use_topics)
    builder.build_from_papers(papers)
    clusters = builder.run_leiden(resolution=args.resolution, seed=args.seed)
    num_clusters = len(set(clusters.values()))
    print(f"Found {num_clusters} clusters", file=sys.stderr)

    # Compute embedding
    print(f"Computing {args.method.upper()} embedding...", file=sys.stderr)

    # Auto-adjust perplexity for t-SNE with small sample sizes
    kwargs = {}
    if args.method == "tsne":
        max_perplexity = (len(papers) - 1) / 3.0
        if max_perplexity < 30.0:
            kwargs['perplexity'] = max(5.0, max_perplexity)
            print(
run_filter function · python · L729-L824 (96 LOC)
src/papersift/cli.py
def run_filter(args):
    """Filter papers by entity, cluster, or DOI list."""
    from papersift import EntityLayerBuilder

    papers = load_papers(args.input)
    matching_dois = set(p['doi'] for p in papers)  # Start with all

    # Entity filter
    if args.entity:
        use_topics = getattr(args, 'use_topics', False)
        builder = EntityLayerBuilder(use_topics=use_topics)
        builder.build_from_papers(papers)

        entity_matches = []
        for entity_name in args.entity:
            found = set(builder.find_papers_by_entity(entity_name))
            entity_matches.append(found)

        if getattr(args, 'entity_any', False):
            # OR: union
            entity_set = set()
            for s in entity_matches:
                entity_set.update(s)
        else:
            # AND: intersection
            entity_set = entity_matches[0]
            for s in entity_matches[1:]:
                entity_set &= s

        matching_dois &= entity_set

    # Cluster fi
run_merge function · python · L827-L849 (23 LOC)
src/papersift/cli.py
def run_merge(args):
    """Merge multiple paper JSON files, deduplicate by DOI.

    Inputs are read in the order given by ``args.inputs``; the first paper
    seen for each DOI is kept. The merged list is written as indented JSON
    to ``args.output`` (parent directories are created as needed) and a
    short summary is printed to stderr.

    Each input is loaded exactly once. Counting the papers during that single
    pass avoids reading every file twice and keeps "-" (stdin) usable as an
    input, since stdin cannot be read a second time.
    """
    all_papers = []
    seen_dois = set()
    total_input = 0

    for input_path in args.inputs:
        papers = load_papers(input_path)
        total_input += len(papers)
        for p in papers:
            doi = p['doi']
            if doi in seen_dois:
                continue
            seen_dois.add(doi)
            all_papers.append(p)

    output_path = Path(args.output)
    output_path.parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, 'w') as f:
        json.dump(all_papers, f, indent=2)

    deduped = total_input - len(all_papers)
    print(f"Merged {len(args.inputs)} files: {total_input} papers -> {len(all_papers)} unique", file=sys.stderr)
    if deduped > 0:
        print(f"  Removed {deduped} duplicates", file=sys.stderr)
    print(f"Saved: {output_path}", file=sys.stderr)
run_subcluster function · python · L852-L913 (62 LOC)
src/papersift/cli.py
def run_subcluster(args):
    """Sub-cluster a specific cluster using the standalone sub_cluster function."""
    from papersift.embedding import sub_cluster

    papers = load_papers(args.input)

    with open(args.clusters_from) as f:
        clusters = json.load(f)

    use_topics = getattr(args, 'use_topics', False)
    target_cid = args.cluster

    # Try to match as int if possible (clusters.json values are often ints)
    try:
        target_cid_int = int(target_cid)
        if any(v == target_cid_int for v in clusters.values()):
            target_cid = target_cid_int
    except ValueError:
        pass

    try:
        sub_results = sub_cluster(
            papers, target_cid, clusters,
            resolution=args.resolution,
            seed=args.seed,
            use_topics=use_topics,
        )
    except ValueError as e:
        print(f"Error: {e}", file=sys.stderr)
        sys.exit(1)

    # Count sub-clusters
    from collections import Counter
    sub_counts = Counter(
extract_paper_entities function · python · L20-L35 (16 LOC)
src/papersift/embedding.py
def extract_paper_entities(
    papers: List[Dict[str, Any]],
    use_topics: bool = False,
) -> Dict[str, set]:
    """Return the entity set of every paper, keyed by DOI.

    A throwaway EntityLayerBuilder is built over *papers* solely to obtain
    its per-paper entity mapping.

    Args:
        papers: List of paper dicts with 'doi' and 'title' fields.
        use_topics: If True, also use OpenAlex topics as entities.

    Returns:
        Mapping of DOI to set of lowercase entity names.
    """
    scratch_builder = EntityLayerBuilder(use_topics=use_topics)
    scratch_builder.build_from_papers(papers)
    entities_by_doi = scratch_builder.paper_entities
    return entities_by_doi
Source: Repobility analyzer · https://repobility.com
build_entity_matrix function · python · L38-L75 (38 LOC)
src/papersift/embedding.py
def build_entity_matrix(
    papers: List[Dict[str, Any]],
    paper_entities: Dict[str, set],
) -> Tuple[np.ndarray, List[str], List[str]]:
    """Build a binary entity-presence matrix.

    Rows correspond to papers (ordered by DOI appearance in *papers*),
    columns correspond to unique entities (sorted alphabetically).
    Papers with no entities receive an all-zero row.

    Args:
        papers: List of paper dicts (used for DOI ordering).
        paper_entities: Mapping of DOI to entity set (from extract_paper_entities).

    Returns:
        Tuple of (matrix, doi_list, entity_list) where matrix has shape
        (n_papers, n_entities) and dtype float32.
    """
    doi_list = [p["doi"] for p in papers]

    # Collect all unique entities across all papers
    all_entities: Set[str] = set()
    for entities in paper_entities.values():
        all_entities.update(entities)
    entity_list = sorted(all_entities)

    entity_index = {ent: i for i, ent in enumerate(entity_list)}
   
compute_embedding function · python · L78-L133 (56 LOC)
src/papersift/embedding.py
def compute_embedding(
    matrix: np.ndarray,
    method: str = "umap",
    n_components: int = 2,
    random_state: int = 42,
    **kwargs: Any,
) -> np.ndarray:
    """Reduce an entity-presence matrix to a low-dimensional embedding.

    Args:
        matrix: 2-D array of shape (n_papers, n_entities).
        method: ``"umap"`` or ``"tsne"``.
        n_components: Target dimensionality (default 2).
        random_state: Seed for reproducibility.
        **kwargs: Forwarded to the underlying reducer constructor.

    Returns:
        ndarray of shape (n_papers, n_components).

    Raises:
        ValueError: If matrix has fewer than 2 rows.
        ImportError: If method is ``"umap"`` and umap-learn is not installed.
        ValueError: If method is not ``"umap"`` or ``"tsne"``.
    """
    if matrix.shape[0] < 2:
        raise ValueError(
            f"compute_embedding requires at least 2 rows, got {matrix.shape[0]}"
        )

    if method == "umap":
        try:
            impo
embed_papers function · python · L136-L191 (56 LOC)
src/papersift/embedding.py
def embed_papers(
    papers: List[Dict[str, Any]],
    method: str = "umap",
    use_topics: bool = False,
    random_state: int = 42,
    **kwargs: Any,
) -> Dict[str, Tuple[float, float]]:
    """High-level: papers in, {doi: (x, y)} out.

    Internally chains extract_paper_entities -> build_entity_matrix ->
    compute_embedding, then maps coordinates back to DOIs.

    Papers whose entity set is empty are placed at the centroid of the
    embedding with small random jitter so they remain visible but do not
    distort the layout.

    Args:
        papers: List of paper dicts with 'doi' and 'title' fields.
        method: ``"umap"`` or ``"tsne"``.
        use_topics: If True, also use OpenAlex topics as entities.
        random_state: Seed for reproducibility.
        **kwargs: Forwarded to compute_embedding.

    Returns:
        Mapping of DOI to (x, y) coordinate tuple.
    """
    paper_entities = extract_paper_entities(papers, use_topics=use_topics)
    matrix, doi_list, enti
sub_cluster function · python · L194-L261 (68 LOC)
src/papersift/embedding.py
def sub_cluster(
    papers: List[Dict[str, Any]],
    cluster_id: Union[int, str],
    clusters: Dict[str, Union[int, str]],
    resolution: float = 1.0,
    seed: Optional[int] = None,
    use_topics: bool = False,
) -> Dict[str, str]:
    """Hierarchical sub-clustering within an existing cluster.

    Filters papers to those belonging to *cluster_id*, builds a new entity
    graph from the subset, runs Leiden, and returns membership with
    hierarchical IDs of the form ``"{cluster_id}.{sub_id}"``.

    If only one paper belongs to the cluster, or Leiden finds only a single
    sub-cluster, the original cluster_id is returned unchanged (as a string).

    Args:
        papers: Full list of paper dicts.
        cluster_id: The cluster to sub-divide.
        clusters: Existing DOI -> cluster_id mapping.
        resolution: Leiden resolution for the sub-clustering.
        seed: Random seed for Leiden.
        use_topics: If True, also use OpenAlex topics as entities.

    Returns:
   
OpenAlexEnricher.__init__ method · python · L28-L34 (7 LOC)
src/papersift/enrich.py
    def __init__(self, email: str):
        """
        Remember the contact email and register it with pyalex.

        Args:
            email: Contact email for OpenAlex polite pool (faster rate limits).
        """
        self.email = email
        # The address is stored on pyalex's module-level config, not per instance.
        pyalex.config.email = self.email
OpenAlexEnricher.enrich_papers method · python · L36-L106 (71 LOC)
src/papersift/enrich.py
    def enrich_papers(
        self,
        papers: List[Dict[str, Any]],
        fields: Optional[List[str]] = None,
        progress: bool = True,
    ) -> List[Dict[str, Any]]:
        """
        Enrich papers with OpenAlex data.

        Args:
            papers: List of paper dicts with 'doi' key.
            fields: Fields to fetch. Default: ['referenced_works', 'openalex_id'].
                    Supported: 'referenced_works', 'openalex_id', 'topics', 'abstract'.
            progress: Show progress output.

        Returns:
            Papers list with requested fields added.
        """
        if fields is None:
            fields = ['referenced_works', 'openalex_id']

        papers_with_doi = [(i, p) for i, p in enumerate(papers) if p.get('doi')]
        total = len(papers_with_doi)

        if progress:
            print(f"Enriching {total} papers (of {len(papers)} total)...")

        enriched_count = 0
        for idx, (i, paper) in enumerate(papers_with_doi):
         
OpenAlexEnricher._fetch_work method · python · L108-L120 (13 LOC)
src/papersift/enrich.py
    def _fetch_work(self, doi: str) -> Optional[Dict]:
        """Look up one work in OpenAlex by its DOI.

        A bare DOI is prefixed with ``https://doi.org/``; a value that already
        starts with ``http`` is used as-is. Any exception raised while
        building the key or performing the lookup yields ``None``.
        """
        try:
            # OpenAlex is queried with the DOI in URL form
            lookup_key = doi if doi.startswith('http') else f"https://doi.org/{doi}"
            return Works()[lookup_key]
        except Exception:
            # Best-effort: every failure is reported as "no work found"
            return None
OpenAlexEnricher._resolve_openalex_ids_to_dois method · python · L122-L157 (36 LOC)
src/papersift/enrich.py
    def _resolve_openalex_ids_to_dois(self, openalex_ids: List[str]) -> List[str]:
        """
        Batch-resolve OpenAlex work IDs to DOIs.

        Args:
            openalex_ids: List of OpenAlex URLs like 'https://openalex.org/W1234567'.

        Returns:
            List of DOIs (strings). IDs that can't be resolved are omitted.
        """
        dois = []

        # Process in batches
        for batch_start in range(0, len(openalex_ids), self.BATCH_SIZE):
            batch = openalex_ids[batch_start:batch_start + self.BATCH_SIZE]

            # Build pipe-separated filter
            id_filter = "|".join(batch)

            try:
                results = Works().filter(openalex_id=id_filter).get()
                for work in results:
                    doi = work.get('doi')
                    if doi:
                        # OpenAlex returns DOIs as URLs, strip prefix
                        if doi.startswith('https://doi.org/'):
                            doi = doi[len
Methodology: Repobility · https://repobility.com/research/state-of-ai-code-2026/
OpenAlexEnricher._reconstruct_abstract method · python · L160-L173 (14 LOC)
src/papersift/enrich.py
    def _reconstruct_abstract(work: Dict) -> Optional[str]:
        """Rebuild a plain-text abstract from an OpenAlex inverted index.

        The ``abstract_inverted_index`` entry of *work* maps each word to the
        positions at which it occurs. The words are laid out by position and
        joined with single spaces. Returns ``None`` when the index is missing
        or empty.
        """
        index = work.get('abstract_inverted_index')
        if not index:
            return None

        # Flatten {word: [positions]} into (position, word) pairs
        placed = [
            (position, token)
            for token, positions in index.items()
            for position in positions
        ]
        # Order by position only; the stable sort keeps ties in index order
        placed.sort(key=lambda pair: pair[0])
        return " ".join(token for _, token in placed)
ImprovedEntityExtractor.__init__ method · python · L60-L134 (75 LOC)
src/papersift/entity_layer.py
    def __init__(self):
        # Methods - use word boundaries
        self.methods = [
            'scGPT', 'transformer', 'transformers', 'LSTM', 'CNN', 'RNN', 'GRU',
            'neural network', 'deep learning', 'machine learning', 'ML', 'DL', 'AI',
            'random forest', 'support vector', 'SVM', 'clustering', 'k-means',
            'classification', 'regression', 'ensemble', 'boosting', 'XGBoost',
            'reinforcement learning', 'RL',
            'GAN', 'VAE', 'autoencoder', 'diffusion model',
            'attention mechanism', 'self-attention', 'BERT', 'GPT', 'LLM',
            'foundation model', 'language model', 'embedding', 'representation learning',
            'transfer learning', 'fine-tuning', 'pre-training', 'pretraining',
            'zero-shot', 'few-shot', 'contrastive learning',
            'graph neural network', 'GNN', 'graph convolutional', 'GCN',
            'message passing', 'node embedding',
            'simulation', 'optimization', 'algorithm',
 
ImprovedEntityExtractor._compile_patterns method · python · L136-L161 (26 LOC)
src/papersift/entity_layer.py
    def _compile_patterns(self):
        """Pre-compile regex patterns for efficient matching."""
        self.method_patterns = []
        for method in self.methods:
            # Escape special chars and add word boundaries
            pattern = re.compile(r'\b' + re.escape(method.lower()) + r'\b', re.IGNORECASE)
            self.method_patterns.append((method, pattern))

        self.organism_patterns = []
        for organism in self.organisms:
            pattern = re.compile(r'\b' + re.escape(organism.lower()) + r'\b', re.IGNORECASE)
            self.organism_patterns.append((organism, pattern))

        # Add 'rat' separately with strict word boundary (not in 'generative', etc.)
        # \brat\b will only match standalone "rat"
        self.organism_patterns.append(('rat', re.compile(r'\brat\b', re.IGNORECASE)))

        self.concept_patterns = []
        for concept in self.concepts:
            pattern = re.compile(r'\b' + re.escape(concept.lower()) + r'\b', re.IGNORECASE)
 
ImprovedEntityExtractor.extract_entities method · python · L163-L225 (63 LOC)
src/papersift/entity_layer.py
    def extract_entities(self, title: str, category: str) -> List[Dict[str, str]]:
        """
        Extract entities from title using word-boundary regex matching.

        Args:
            title: Paper title to extract entities from
            category: Paper category (currently unused, for future expansion)

        Returns:
            List of {"name": str, "type": str} dicts
        """
        entities = []
        seen = set()

        # Extract methods
        for method, pattern in self.method_patterns:
            if pattern.search(title):
                key = method.lower()
                if key not in seen:
                    entities.append({"name": method, "type": "METHOD"})
                    seen.add(key)

        # Extract organisms
        for organism, pattern in self.organism_patterns:
            if pattern.search(title):
                key = organism.lower()
                if key not in seen:
                    entities.append({"name": organism, "type":
EntityLayerBuilder.__init__ method · python · L236-L248 (13 LOC)
src/papersift/entity_layer.py
    def __init__(self, use_topics: bool = False):
        """
        Create a builder with an extractor but no graph yet.

        Args:
            use_topics: If True, also use OpenAlex topics from paper['topics'] as entities.
                       Requires enriched paper data with 'topics' field.
        """
        self.use_topics = use_topics
        self.extractor = ImprovedEntityExtractor()
        # Per-paper state starts empty; the graph stays None until one is built
        self._dois: List[str] = []
        self._paper_entities: Dict[str, set] = {}  # doi -> set(entity_names)
        self.graph: Optional[ig.Graph] = None
EntityLayerBuilder._extract_entities_for_paper method · python · L250-L279 (30 LOC)
src/papersift/entity_layer.py
    def _extract_entities_for_paper(self, paper: Dict[str, Any]) -> Set[str]:
        """
        Extract entities from a paper, optionally including topics.

        Args:
            paper: Paper dict with 'title', optional 'category', optional 'topics'

        Returns:
            Set of lowercase entity names
        """
        # Rule-based entities from title
        entities = self.extractor.extract_entities(
            paper['title'],
            paper.get('category', '')
        )
        entity_set = {e['name'].lower() for e in entities}

        # Add OpenAlex topics if enabled
        if self.use_topics:
            for topic in paper.get('topics', []):
                # Add topic display name
                display_name = topic.get('display_name', '')
                if display_name:
                    entity_set.add(display_name.lower())
                # Add subfield for broader coverage
                subfield = topic.get('subfield', {}).get('display_name', '')
   
EntityLayerBuilder.build_from_papers method · python · L281-L326 (46 LOC)
src/papersift/entity_layer.py
    def build_from_papers(self, papers: List[Dict[str, Any]]) -> ig.Graph:
        """
        Build paper-paper graph via shared entities.

        Algorithm:
        1. Extract entities from each paper (title + optional topics)
        2. For each paper pair: edge weight = |shared entities|
        3. Create igraph with DOIs as node attributes

        Args:
            papers: List of paper dicts with 'doi' and 'title' fields

        Returns:
            igraph.Graph with DOI vertex attributes and weight edge attributes
        """
        # Step 1: Extract entities
        self._dois = []
        self._paper_entities = {}

        for paper in papers:
            doi = paper['doi']
            self._dois.append(doi)
            self._paper_entities[doi] = self._extract_entities_for_paper(paper)

        # Step 2: Compute edges
        n = len(self._dois)
        edges = []
        weights = []

        for i in range(n):
            doi1 = self._dois[i]
            ents1 = self._p
EntityLayerBuilder.run_leiden method · python · L328-L357 (30 LOC)
src/papersift/entity_layer.py
    def run_leiden(
        self,
        resolution: float = 1.0,
        seed: Optional[int] = None
    ) -> Dict[str, int]:
        """
        Cluster the paper graph with the Leiden algorithm.

        Uses ``RBConfigurationVertexPartition`` with the ``'weight'`` edge
        attribute as edge weights.

        Args:
            resolution: Higher = more clusters
            seed: Random seed for reproducibility

        Returns:
            {doi: cluster_id}

        Raises:
            ValueError: If no graph has been built yet.
        """
        graph = self.graph
        if graph is None:
            raise ValueError("Call build_from_papers() first")

        partition = leidenalg.find_partition(
            graph,
            leidenalg.RBConfigurationVertexPartition,
            resolution_parameter=resolution,
            weights='weight',
            seed=seed,
        )

        # Pair each vertex's DOI with the cluster assigned to that vertex index
        assignment = {}
        for vertex_index in range(len(graph.vs)):
            doi = graph.vs[vertex_index]['doi']
            assignment[doi] = partition.membership[vertex_index]
        return assignment
Generated by Repobility's multi-pass static-analysis pipeline (https://repobility.com)
EntityLayerBuilder.get_cluster_summary method · python · L359-L402 (44 LOC)
src/papersift/entity_layer.py
    def get_cluster_summary(self, clusters: Dict[str, int]) -> List[Dict]:
        """
        Generate summary for each cluster.

        Args:
            clusters: Mapping of DOI to cluster_id from run_leiden()

        Returns:
            List of cluster summaries, sorted by size (largest first):
            {
                "cluster_id": int,
                "size": int,
                "dois": List[str],
                "top_entities": List[str]  # Most common entities (top 10)
            }
        """
        # Group by cluster
        cluster_members = defaultdict(list)
        for doi, cid in clusters.items():
            cluster_members[cid].append(doi)

        summaries = []
        for cid, dois in sorted(cluster_members.items()):
            # Count entities in this cluster
            entity_counts = defaultdict(int)
            for doi in dois:
                for ent in self._paper_entities.get(doi, []):
                    entity_counts[ent] += 1

            top_e
EntityLayerBuilder.find_hub_papers method · python · L411-L447 (37 LOC)
src/papersift/entity_layer.py
    def find_hub_papers(self, top_k: int = 10) -> List[Dict]:
        """
        Find Entity Hub Papers (Citation Major Paper alternative).

        Hub Score = weighted degree = sum of shared entities with all neighbors
        High hub score = paper shares many entities with many other papers

        Args:
            top_k: Number of top hub papers to return

        Returns:
            List of hub paper dicts (sorted by hub_score descending):
            {
                "doi": str,
                "hub_score": int,
                "entities": List[str] (top 10 entities)
            }
        """
        if self.graph is None:
            raise ValueError("Call build_from_papers() first")

        # Weighted degree = sum of edge weights
        scores = self.graph.strength(weights='weight')

        ranked = sorted(
            zip(self._dois, scores),
            key=lambda x: -x[1]
        )[:top_k]

        return [
            {
                'doi': doi,
                '
EntityLayerBuilder.find_papers_by_entity method · python · L449-L463 (15 LOC)
src/papersift/entity_layer.py
    def find_papers_by_entity(self, entity_name: str) -> List[str]:
        """
        List the papers whose entity set contains the given entity.

        The name is lowercased before it is compared against the stored
        entity sets.

        Args:
            entity_name: Entity to search for (case-insensitive)

        Returns:
            List of DOIs containing this entity
        """
        needle = entity_name.lower()
        matches = []
        for doi, entities in self._paper_entities.items():
            if needle in entities:
                matches.append(doi)
        return matches
EntityLayerBuilder.expand_from_seed method · python · L465-L501 (37 LOC)
src/papersift/entity_layer.py
    def expand_from_seed(self, seed_doi: str, hops: int = 1) -> Set[str]:
        """
        Expand from seed paper via shared entities (Citation stream alternative).

        Like citation forward/backward traversal, but using entity connections.

        Args:
            seed_doi: Starting paper DOI
            hops: Number of expansion hops (1 = direct neighbors only)

        Returns:
            Set of DOIs reachable within hops (including seed)
        """
        if self.graph is None:
            raise ValueError("Call build_from_papers() first")

        if seed_doi not in self._dois:
            raise ValueError(f"Seed DOI not in graph: {seed_doi}")

        visited = {seed_doi}
        current = {seed_doi}

        for _ in range(hops):
            next_layer = set()
            for doi in current:
                idx = self._dois.index(doi)
                neighbors = self.graph.neighbors(idx)
                for n_idx in neighbors:
                    n_doi = self.graph.
EntityLayerBuilder.entity_stream method · python · L503-L564 (62 LOC)
src/papersift/entity_layer.py
    def entity_stream(
        self,
        start_doi: str,
        strategy: str = 'strongest',
        max_hops: int = 10
    ) -> List[str]:
        """
        Follow entity connections like citation stream traversal.

        Strategies:
        - 'strongest': Follow edge with highest weight (most shared entities)
        - 'diverse': Follow edge introducing most new entities

        Args:
            start_doi: Starting paper DOI
            strategy: 'strongest' or 'diverse'
            max_hops: Maximum path length

        Returns:
            Ordered path of DOIs from start
        """
        if self.graph is None:
            raise ValueError("Call build_from_papers() first")

        path = [start_doi]
        visited = {start_doi}
        current = start_doi

        for _ in range(max_hops):
            idx = self._dois.index(current)
            neighbors = self.graph.neighbors(idx)

            if not neighbors:
                break

            candidates = []
    
create_app function · python · L23-L146 (124 LOC)
src/papersift/ui/app.py
def create_app(papers_path: str, use_topics: bool = False) -> Dash:
    """
    Create and configure the Dash application.

    Args:
        papers_path: Path to papers JSON file
        use_topics: If True, use OpenAlex topics for enhanced clustering

    Returns:
        Configured Dash application
    """
    # Load and process data
    papers = load_papers(papers_path)

    # Detect topics presence and override use_topics if data has topics
    has_topics = any('topics' in p and p['topics'] for p in papers)
    if has_topics and not use_topics:
        use_topics = has_topics

    clusters, builder = cluster_papers(papers, resolution=1.0, use_topics=use_topics)
    elements = papers_to_cytoscape_elements(papers, clusters, builder)
    rows = papers_to_table_data(papers, clusters)
    colors = generate_cluster_colors(set(clusters.values()))

    # Compute embedding for landscape (standalone, no builder needed)
    embedding = compute_paper_embedding(papers, method="tsne", use_topic
run_server function · python · L149-L169 (21 LOC)
src/papersift/ui/app.py
def run_server(
    papers_path: str,
    port: int = 8050,
    debug: bool = False,
    host: str = "127.0.0.1",
    use_topics: bool = False,
):
    """
    Build the PaperSift Dash app and start serving it.

    Args:
        papers_path: Path to papers JSON file
        port: Server port (default 8050)
        debug: Enable debug mode
        host: Server host (default 127.0.0.1, use 0.0.0.0 for external access)
        use_topics: If True, use OpenAlex topics for enhanced clustering
    """
    app = create_app(papers_path, use_topics=use_topics)

    # The wildcard bind address gets an extra hint in the startup banner.
    if host == "0.0.0.0":
        url = f"http://0.0.0.0:{port} (accessible externally)"
    else:
        url = f"http://{host}:{port}"

    print(f"Starting PaperSift UI at {url}")
    app.run(debug=debug, port=port, host=host)
_push_checkpoint function · python · L7-L19 (13 LOC)
src/papersift/ui/callbacks/clustering.py
def _push_checkpoint(history, checkpoint):
    """Push a checkpoint onto the history stack."""
    h = dict(history)
    checkpoints = list(h.get('checkpoints', []))
    max_size = h.get('max_size', 20)

    checkpoints.append(checkpoint)
    if len(checkpoints) > max_size:
        checkpoints = checkpoints[-max_size:]

    h['checkpoints'] = checkpoints
    h['current_index'] = len(checkpoints) - 1
    return h
Want fix-PRs on findings? Install Repobility's GitHub App · github.com/apps/repobility-bot
register_history_callbacks function · python · L6-L81 (76 LOC)
src/papersift/ui/callbacks/history.py
def register_history_callbacks(app):
    """Register undo and history display callbacks."""

    # Undo button
    @app.callback(
        Output('papers-data', 'data', allow_duplicate=True),
        Output('cluster-data', 'data', allow_duplicate=True),
        Output('cytoscape-network', 'elements', allow_duplicate=True),
        Output('paper-table', 'rowData', allow_duplicate=True),
        Output('cluster-colors', 'data', allow_duplicate=True),
        Output('selection-store', 'data', allow_duplicate=True),
        Output('embedding-data', 'data', allow_duplicate=True),
        Output('navigation-state', 'data', allow_duplicate=True),
        Output('history-stack', 'data', allow_duplicate=True),
        Output('breadcrumb-container', 'children', allow_duplicate=True),
        Input('undo-btn', 'n_clicks'),
        State('history-stack', 'data'),
        State('original-papers', 'data'),
        State('resolution-slider', 'value'),
        State('use-topics-flag', 'data'),
        
_push_checkpoint function · python · L215-L227 (13 LOC)
src/papersift/ui/callbacks/navigation.py
def _push_checkpoint(history, checkpoint):
    """Push a checkpoint onto the history stack."""
    h = dict(history)
    checkpoints = list(h.get('checkpoints', []))
    max_size = h.get('max_size', 20)

    checkpoints.append(checkpoint)
    if len(checkpoints) > max_size:
        checkpoints = checkpoints[-max_size:]

    h['checkpoints'] = checkpoints
    h['current_index'] = len(checkpoints) - 1
    return h
register_selection_callbacks function · python · L6-L80 (75 LOC)
src/papersift/ui/callbacks/selection.py
def register_selection_callbacks(app):
    """Register all selection-related callbacks."""

    # Network selection -> Store
    @app.callback(
        Output('selection-store', 'data', allow_duplicate=True),
        Input('cytoscape-network', 'selectedNodeData'),
        prevent_initial_call=True
    )
    def network_to_store(selected_nodes):
        if selected_nodes is None:
            return {'selected_dois': [], 'source': 'network'}
        dois = [node['id'] for node in selected_nodes]
        return {'selected_dois': dois, 'source': 'network'}

    # Table selection -> Store
    @app.callback(
        Output('selection-store', 'data', allow_duplicate=True),
        Input('paper-table', 'selectedRows'),
        prevent_initial_call=True
    )
    def table_to_store(selected_rows):
        if selected_rows is None:
            return {'selected_dois': [], 'source': 'table'}
        dois = [row['doi'] for row in selected_rows]
        return {'selected_dois': dois, 'source': 'tab
create_breadcrumb function · python · L6-L38 (33 LOC)
src/papersift/ui/components/breadcrumb.py
def create_breadcrumb(path: list) -> html.Div:
    """
    Render breadcrumb navigation showing the drill-down hierarchy.

    Args:
        path: list of cluster IDs forming the drill-down path
              [] -> "All Papers"
              [3] -> "All Papers > Cluster 3"
              [3, 1] -> "All Papers > Cluster 3 > Sub 3.1"

    Returns:
        html.Div with breadcrumb trail
    """
    items = [html.Span('All Papers', style={'fontWeight': 'bold'})]

    for i, cid in enumerate(path):
        items.append(html.Span(' > ', style={'color': '#999', 'margin': '0 4px'}))
        label = f'Cluster {cid}' if i == 0 else f'Sub {".".join(str(x) for x in path[:i+1])}'
        if i == len(path) - 1:
            # Current level (not clickable)
            items.append(html.Span(label, style={'fontWeight': 'bold', 'color': '#6f42c1'}))
        else:
            items.append(html.Span(label, style={'color': '#007bff'}))

    return html.Div(
        children=items,
        style={
          
create_landscape_figure function · python · L7-L63 (57 LOC)
src/papersift/ui/components/landscape.py
def create_landscape_figure(
    embedding_data: Dict[str, list],
    clusters: Dict[str, Any],
    colors: Dict[Any, str],
    papers: List[Dict[str, Any]],
):
    """Create Plotly scatter figure for paper landscape.

    Args:
        embedding_data: {doi: [x, y]} coordinates
        clusters: {doi: cluster_id}
        colors: {cluster_id: hex_color}
        papers: paper list for hover info

    Returns:
        plotly Figure
    """
    import plotly.graph_objects as go

    doi_to_paper = {p['doi']: p for p in papers}

    # Group DOIs by cluster
    cluster_dois = {}
    for doi, cid in clusters.items():
        cluster_dois.setdefault(cid, []).append(doi)

    fig = go.Figure()
    for cid in sorted(cluster_dois.keys(), key=str):
        dois = cluster_dois[cid]
        xs = [embedding_data[d][0] for d in dois if d in embedding_data]
        ys = [embedding_data[d][1] for d in dois if d in embedding_data]
        titles = [doi_to_paper.get(d, {}).get('title', d)[:60] for d in do
create_landscape_component function · python · L66-L95 (30 LOC)
src/papersift/ui/components/landscape.py
def create_landscape_component(
    embedding_data: Dict[str, list],
    clusters: Dict[str, Any],
    colors: Dict[Any, str],
    papers: List[Dict[str, Any]],
) -> html.Div:
    """Wrap the landscape scatter figure in a Dash container.

    Args:
        embedding_data: {doi: [x, y]} coordinates
        clusters: {doi: cluster_id}
        colors: {cluster_id: hex_color}
        papers: paper list for hover info

    Returns:
        html.Div (id 'landscape-container') holding a single dcc.Graph
        (id 'landscape-scatter')
    """
    scatter = dcc.Graph(
        id='landscape-scatter',
        figure=create_landscape_figure(embedding_data, clusters, colors, papers),
        config={'displayModeBar': True, 'scrollZoom': True},
        style={'height': '600px'},
    )
    return html.Div(id='landscape-container', children=[scatter])
create_network_component function · python · L7-L52 (46 LOC)
src/papersift/ui/components/network.py
def create_network_component(elements: list, stylesheet: list = None) -> html.Div:
    """
    Create Cytoscape network component with box selection enabled.

    Args:
        elements: List of node/edge elements
        stylesheet: Optional custom stylesheet

    Returns:
        Dash Div containing the Cytoscape component
    """
    if stylesheet is None:
        stylesheet = get_default_stylesheet()

    return html.Div([
        cyto.Cytoscape(
            id='cytoscape-network',
            elements=elements,
            stylesheet=stylesheet,
            layout={
                'name': 'cose',
                'animate': False,
                'nodeRepulsion': 8000,
                'idealEdgeLength': 100,
                'edgeElasticity': 100,
                'nestingFactor': 0.1,
                'gravity': 0.25,
                'numIter': 500,
                'initialTemp': 200,
                'coolingFactor': 0.95,
                'minTemp': 1.0,
            },
            s
get_default_stylesheet function · python · L55-L98 (44 LOC)
src/papersift/ui/components/network.py
def get_default_stylesheet() -> list:
    """
    Get default stylesheet for Cytoscape.

    Nodes are colored by cluster, edges are gray.
    Selected nodes have a thick border.
    """
    return [
        # Base node style
        {
            'selector': 'node',
            'style': {
                'label': 'data(label)',
                'background-color': 'data(color)',
                'width': 20,
                'height': 20,
                'font-size': '8px',
                'text-valign': 'bottom',
                'text-halign': 'center',
                'text-wrap': 'ellipsis',
                'text-max-width': '80px',
            }
        },
        # Selected node style
        {
            'selector': 'node:selected',
            'style': {
                'border-width': 3,
                'border-color': '#000',
                'width': 30,
                'height': 30,
            }
        },
        # Base edge style
        {
            'selector': 'edge',
  
Source: Repobility analyzer · https://repobility.com
_escape_doi_for_selector function · python · L101-L107 (7 LOC)
src/papersift/ui/components/network.py
def _escape_doi_for_selector(doi: str) -> str:
    """Escape DOI for use in CSS selector."""
    # Escape backslash first, then other special chars
    result = doi
    for char in ['\\', '"', "'", '[', ']', '/', '.', ':']:
        result = result.replace(char, f'\\{char}')
    return result
get_highlight_stylesheet function · python · L110-L139 (30 LOC)
src/papersift/ui/components/network.py
def get_highlight_stylesheet(
    base_stylesheet: list,
    selected_dois: list,
) -> list:
    """
    Build a stylesheet that adds a red-border rule per selected DOI.

    Args:
        base_stylesheet: Default stylesheet (left unmodified; a shallow
            copy is extended)
        selected_dois: List of selected DOIs

    Returns:
        New list: the base rules followed by one highlight rule per DOI
    """
    combined = list(base_stylesheet)

    # One id-matching rule per selected node, appended after the base rules.
    combined.extend(
        {
            'selector': f'node[id = "{_escape_doi_for_selector(selected)}"]',
            'style': {
                'border-width': 3,
                'border-color': '#ff0000',
                'width': 30,
                'height': 30,
            },
        }
        for selected in selected_dois
    )

    return combined
create_sidebar function · python · L6-L176 (171 LOC)
src/papersift/ui/components/sidebar.py
def create_sidebar() -> html.Div:
    """
    Create sidebar with controls for filtering, navigation, and re-clustering.

    Contains:
    - Resolution slider
    - Selection actions (Keep/Exclude/Reset)
    - Navigation (Drill-down/Back)
    - Undo
    - Export
    - Statistics
    """
    return html.Div([
        html.H3('Controls', style={'marginBottom': '20px'}),

        # Resolution slider
        html.Div([
            html.Label('Cluster Resolution'),
            dcc.Slider(
                id='resolution-slider',
                min=0.1,
                max=3.0,
                step=0.1,
                value=1.0,
                marks={0.5: '0.5', 1.0: '1.0', 2.0: '2.0', 3.0: '3.0'},
                tooltip={'placement': 'bottom', 'always_visible': True},
                updatemode='mouseup',
            ),
            html.Small('Higher = more clusters', style={'color': '#666'})
        ], style={'marginBottom': '30px'}),

        # Selection actions
        html.Div([
   
page 1 / 2next ›