Function bodies 69 total
run_enrich function · python · L221-L241 (21 LOC)src/papersift/cli.py
def run_enrich(args):
    """Run the ``enrich`` subcommand.

    Loads papers from ``args.input``, asks ``OpenAlexEnricher`` for the
    comma-separated field names in ``args.fields`` and writes the result as
    indented JSON to ``args.output``, creating parent directories as needed.
    Exits with status 1 when the enrichment module cannot be imported.

    Args:
        args: Parsed CLI namespace providing ``input``, ``fields``,
            ``email`` and ``output``.
    """
    try:
        from papersift.enrich import OpenAlexEnricher
    except ImportError:
        message = "Error: enrichment requires pyalex. Install with: pip install papersift[enrich]"
        print(message, file=sys.stderr)
        sys.exit(1)

    source_papers = load_papers(args.input)
    # Field names arrive as one comma-separated string; trim each piece.
    requested = list(map(str.strip, args.fields.split(',')))
    result = OpenAlexEnricher(email=args.email).enrich_papers(source_papers, fields=requested)

    destination = Path(args.output)
    destination.parent.mkdir(parents=True, exist_ok=True)
    with open(destination, 'w') as handle:
        json.dump(result, handle, indent=2)
    print(f"Saved: {destination}")

# run_find function · python · L244-L292 (49 LOC)src/papersift/cli.py
def run_find(args):
"""Execute find command."""
from papersift import EntityLayerBuilder
papers = load_papers(args.input)
use_topics = getattr(args, 'use_topics', False)
builder = EntityLayerBuilder(use_topics=use_topics)
builder.build_from_papers(papers)
if args.hubs:
hubs = builder.find_hub_papers(top_k=args.hubs)
if args.format == "json":
# JSON output
output = []
for h in hubs:
title = get_title(papers, h['doi'])
output.append({
'doi': h['doi'],
'title': title,
'hub_score': h['hub_score'],
'entities': h['entities']
})
print(json.dumps(output, indent=2))
else:
# Table output (default)
print(f"Top {args.hubs} Entity Hub Papers:")
print("-" * 60)
for i, h in enumerate(hubs, 1):
title = get_title(run_stream function · python · L295-L334 (40 LOC)src/papersift/cli.py
def run_stream(args):
"""Execute stream command."""
from papersift import EntityLayerBuilder
papers = load_papers(args.input)
use_topics = getattr(args, 'use_topics', False)
builder = EntityLayerBuilder(use_topics=use_topics)
builder.build_from_papers(papers)
if args.expand:
reachable = builder.expand_from_seed(args.seed, hops=args.hops)
if args.format == "json":
# JSON output
output = []
for doi in reachable:
title = get_title(papers, doi)
output.append({'doi': doi, 'title': title})
print(json.dumps(output, indent=2))
else:
# Table output (default)
print(f"Papers reachable in {args.hops} hops from seed: {len(reachable)}")
for doi in list(reachable)[:20]:
title = get_title(papers, doi)
print(f" - {title[:60]}...")
else:
path = builder.entity_stream(args.seed, strategy=args.sload_papers function · python · L337-L358 (22 LOC)src/papersift/cli.py
def load_papers(path):
    """Load papers from a JSON file or from stdin.

    Args:
        path: File path, or "-" to read the JSON document from stdin.

    Returns:
        The decoded JSON document. When it is an object, its 'papers' value
        is returned if present (otherwise the object itself); any other
        document, typically a list of paper dicts, is returned as-is.

    Exits with status 1 when "-" is given but stdin is a terminal, or when
    stdin does not contain valid JSON.
    """
    if path == "-":
        # A TTY on stdin means nothing was piped in; fail fast instead of
        # blocking while waiting for interactive input.
        if sys.stdin.isatty():
            print("Error: No input on stdin. Use '-' only when piping data.", file=sys.stderr)
            sys.exit(1)
        try:
            data = json.load(sys.stdin)
        except json.JSONDecodeError:
            print("Error: Invalid JSON input on stdin", file=sys.stderr)
            sys.exit(1)
    else:
        # JSON exchanged between systems is UTF-8; do not let the platform's
        # locale encoding decide how titles and abstracts are decoded.
        with open(path, encoding="utf-8") as f:
            data = json.load(f)
    return data.get('papers', data) if isinstance(data, dict) else data

# run_cluster function · python · L365-L436 (72 LOC)src/papersift/cli.py
def run_cluster(args):
"""Execute clustering command."""
from papersift import EntityLayerBuilder, ClusterValidator
# Load papers
papers = load_papers(args.input)
print(f"Loaded {len(papers)} papers")
# Build entity graph and cluster
use_topics = getattr(args, 'use_topics', False)
mode = "Title + OpenAlex Topics" if use_topics else "Title-only"
print(f"Building entity graph ({mode})...")
builder = EntityLayerBuilder(use_topics=use_topics)
builder.build_from_papers(papers)
print(f" Graph: {builder.graph.vcount()} nodes, {builder.graph.ecount()} edges")
print(f"Running Leiden clustering (resolution={args.resolution}, seed={args.seed})...")
clusters = builder.run_leiden(resolution=args.resolution, seed=args.seed)
num_clusters = len(set(clusters.values()))
print(f" Found {num_clusters} clusters")
# Generate summaries
summaries = builder.get_cluster_summary(clusters)
# Create output directory
output_dir =run_ui function · python · L439-L461 (23 LOC)src/papersift/cli.py
def run_ui(args):
    """Start the interactive UI server, or write a static HTML export.

    When ``args.export`` is set, a static network HTML file is generated and
    no server is started; combining it with a non-default ``args.host`` is
    rejected with exit status 1. Otherwise the UI server is launched, exiting
    with status 1 if the UI module cannot be imported.

    Args:
        args: Parsed CLI namespace providing ``input``, ``export``, ``host``,
            ``port`` and ``debug``; ``mode`` and ``use_topics`` are optional.
    """
    exporting = bool(args.export)

    # --export and a custom --host are mutually exclusive.
    if exporting and args.host != "127.0.0.1":
        print("Error: --export and --host cannot be used together.", file=sys.stderr)
        print("  --export generates a static file, no server is started.", file=sys.stderr)
        sys.exit(1)

    if exporting:
        from papersift.ui.exporter import export_network_html
        export_mode = getattr(args, 'mode', 'cluster')
        export_network_html(args.input, args.export, mode=export_mode)
        return

    try:
        from papersift.ui.app import run_server
    except ImportError:
        print("Error: UI requires additional dependencies. Install with: pip install -r requirements-ui.txt",
              file=sys.stderr)
        sys.exit(1)
    topics_enabled = getattr(args, 'use_topics', False)
    run_server(args.input, port=args.port, debug=args.debug, host=args.host,
               use_topics=topics_enabled)

# run_browse function · python · L464-L534 (71 LOC)src/papersift/cli.py
def run_browse(args):
"""Browse cluster contents in text mode."""
from papersift import EntityLayerBuilder
papers = load_papers(args.input)
use_topics = getattr(args, 'use_topics', False)
builder = EntityLayerBuilder(use_topics=use_topics)
builder.build_from_papers(papers)
clusters = builder.run_leiden(resolution=args.resolution, seed=42)
summaries = builder.get_cluster_summary(clusters)
# Sort by size descending
summaries.sort(key=lambda s: s['size'], reverse=True)
# Default to --list if no specific action
if not args.cluster and not args.export:
args.list = True
if args.list:
_browse_list(summaries, len(papers), args.format)
if args.cluster:
cluster_ids = [int(x.strip()) for x in args.cluster.split(',')]
_browse_detail(summaries, cluster_ids, papers, args.full, args.format)
if args.export:
_browse_export(summaries, cluster_ids, papers, args.export)
if getattr(args, 'Want fix-PRs on findings? Install Repobility's GitHub App · github.com/apps/repobility-bot
_browse_list function · python · L537-L554 (18 LOC)src/papersift/cli.py
def _browse_list(summaries, total_papers, format_type="table"):
"""Print cluster list summary."""
if format_type == "json":
# JSON output
output = []
for s in summaries:
output.append({
'cluster_id': s['cluster_id'],
'size': s['size'],
'top_entities': s['top_entities']
})
print(json.dumps(output, indent=2))
else:
# Table output (default)
print(f"{len(summaries)} clusters found ({total_papers} papers total)\n")
for s in summaries:
entities = ', '.join(s['top_entities'][:5])
print(f"Cluster {s['cluster_id']} ({s['size']} papers): {entities}")_browse_detail function · python · L557-L608 (52 LOC)src/papersift/cli.py
def _browse_detail(summaries, cluster_ids, papers, full=False, format_type="table"):
"""Print detailed cluster info."""
summary_map = {s['cluster_id']: s for s in summaries}
if format_type == "json":
# JSON output
output = []
for cid in cluster_ids:
if cid not in summary_map:
continue
s = summary_map[cid]
# Get sample papers with details
sample_papers = []
for doi in s['dois'][:3]:
title = get_title(papers, doi)
year = next((p.get('year', '?') for p in papers if p['doi'] == doi), '?')
sample_papers.append({'doi': doi, 'title': title, 'year': year})
output.append({
'cluster_id': cid,
'size': s['size'],
'top_entities': s['top_entities'],
'sample_papers': sample_papers,
'dois': s['dois'] if full else s['dois'][:10]
})
p_browse_export function · python · L611-L625 (15 LOC)src/papersift/cli.py
def _browse_export(summaries, cluster_ids, papers, output_path):
"""Export selected clusters to JSON."""
summary_map = {s['cluster_id']: s for s in summaries}
selected_dois = set()
for cid in cluster_ids:
if cid in summary_map:
selected_dois.update(summary_map[cid]['dois'])
selected_papers = [p for p in papers if p['doi'] in selected_dois]
with open(output_path, 'w') as f:
json.dump(selected_papers, f, indent=2)
print(f"Selected {len(cluster_ids)} clusters ({len(selected_papers)} papers total)")
print(f"Exported to: {output_path}")run_landscape function · python · L628-L726 (99 LOC)src/papersift/cli.py
def run_landscape(args):
"""Generate landscape visualization as HTML."""
from papersift.embedding import embed_papers
from papersift import EntityLayerBuilder
papers = load_papers(args.input)
use_topics = getattr(args, 'use_topics', False)
print(f"Loaded {len(papers)} papers", file=sys.stderr)
# Cluster for coloring
builder = EntityLayerBuilder(use_topics=use_topics)
builder.build_from_papers(papers)
clusters = builder.run_leiden(resolution=args.resolution, seed=args.seed)
num_clusters = len(set(clusters.values()))
print(f"Found {num_clusters} clusters", file=sys.stderr)
# Compute embedding
print(f"Computing {args.method.upper()} embedding...", file=sys.stderr)
# Auto-adjust perplexity for t-SNE with small sample sizes
kwargs = {}
if args.method == "tsne":
max_perplexity = (len(papers) - 1) / 3.0
if max_perplexity < 30.0:
kwargs['perplexity'] = max(5.0, max_perplexity)
print(run_filter function · python · L729-L824 (96 LOC)src/papersift/cli.py
def run_filter(args):
"""Filter papers by entity, cluster, or DOI list."""
from papersift import EntityLayerBuilder
papers = load_papers(args.input)
matching_dois = set(p['doi'] for p in papers) # Start with all
# Entity filter
if args.entity:
use_topics = getattr(args, 'use_topics', False)
builder = EntityLayerBuilder(use_topics=use_topics)
builder.build_from_papers(papers)
entity_matches = []
for entity_name in args.entity:
found = set(builder.find_papers_by_entity(entity_name))
entity_matches.append(found)
if getattr(args, 'entity_any', False):
# OR: union
entity_set = set()
for s in entity_matches:
entity_set.update(s)
else:
# AND: intersection
entity_set = entity_matches[0]
for s in entity_matches[1:]:
entity_set &= s
matching_dois &= entity_set
# Cluster firun_merge function · python · L827-L849 (23 LOC)src/papersift/cli.py
def run_merge(args):
    """Merge multiple paper JSON files, deduplicating by DOI.

    Inputs are read once each, in the order given by ``args.inputs``; when a
    DOI occurs more than once the first occurrence is kept. The merged list
    is written as indented JSON to ``args.output`` (parent directories are
    created as needed) and a summary is printed to stderr.

    Args:
        args: Parsed CLI namespace providing ``inputs`` (paths accepted by
            ``load_papers``) and ``output`` (destination file path).
    """
    all_papers = []
    seen_dois = set()
    total_input = 0
    for input_path in args.inputs:
        papers = load_papers(input_path)
        # Tally while loading instead of re-loading every input afterwards:
        # a second read doubles the I/O and cannot work for "-" because
        # stdin is already consumed by the first read.
        total_input += len(papers)
        for p in papers:
            doi = p['doi']
            if doi not in seen_dois:
                all_papers.append(p)
                seen_dois.add(doi)
    output_path = Path(args.output)
    output_path.parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, 'w') as f:
        json.dump(all_papers, f, indent=2)
    deduped = total_input - len(all_papers)
    print(f"Merged {len(args.inputs)} files: {total_input} papers -> {len(all_papers)} unique", file=sys.stderr)
    if deduped > 0:
        print(f"  Removed {deduped} duplicates", file=sys.stderr)
    print(f"Saved: {output_path}", file=sys.stderr)

# run_subcluster function · python · L852-L913 (62 LOC)src/papersift/cli.py
def run_subcluster(args):
"""Sub-cluster a specific cluster using the standalone sub_cluster function."""
from papersift.embedding import sub_cluster
papers = load_papers(args.input)
with open(args.clusters_from) as f:
clusters = json.load(f)
use_topics = getattr(args, 'use_topics', False)
target_cid = args.cluster
# Try to match as int if possible (clusters.json values are often ints)
try:
target_cid_int = int(target_cid)
if any(v == target_cid_int for v in clusters.values()):
target_cid = target_cid_int
except ValueError:
pass
try:
sub_results = sub_cluster(
papers, target_cid, clusters,
resolution=args.resolution,
seed=args.seed,
use_topics=use_topics,
)
except ValueError as e:
print(f"Error: {e}", file=sys.stderr)
sys.exit(1)
# Count sub-clusters
from collections import Counter
sub_counts = Counter(extract_paper_entities function · python · L20-L35 (16 LOC)src/papersift/embedding.py
def extract_paper_entities(
    papers: List[Dict[str, Any]],
    use_topics: bool = False,
) -> Dict[str, set]:
    """Run a throwaway EntityLayerBuilder over *papers* and return its entities.

    Args:
        papers: List of paper dicts with 'doi' and 'title' fields.
        use_topics: Forwarded to the builder; if True, OpenAlex topics are
            used as entities as well.

    Returns:
        The builder's ``paper_entities`` value after building, annotated as
        a mapping of DOI to entity set.
    """
    layer = EntityLayerBuilder(use_topics=use_topics)
    # Only the extracted entity sets are kept; the builder itself is discarded.
    layer.build_from_papers(papers)
    entities_by_doi = layer.paper_entities
    return entities_by_doi

# Source: Repobility analyzer · https://repobility.com
build_entity_matrix function · python · L38-L75 (38 LOC)src/papersift/embedding.py
def build_entity_matrix(
papers: List[Dict[str, Any]],
paper_entities: Dict[str, set],
) -> Tuple[np.ndarray, List[str], List[str]]:
"""Build a binary entity-presence matrix.
Rows correspond to papers (ordered by DOI appearance in *papers*),
columns correspond to unique entities (sorted alphabetically).
Papers with no entities receive an all-zero row.
Args:
papers: List of paper dicts (used for DOI ordering).
paper_entities: Mapping of DOI to entity set (from extract_paper_entities).
Returns:
Tuple of (matrix, doi_list, entity_list) where matrix has shape
(n_papers, n_entities) and dtype float32.
"""
doi_list = [p["doi"] for p in papers]
# Collect all unique entities across all papers
all_entities: Set[str] = set()
for entities in paper_entities.values():
all_entities.update(entities)
entity_list = sorted(all_entities)
entity_index = {ent: i for i, ent in enumerate(entity_list)}
compute_embedding function · python · L78-L133 (56 LOC)src/papersift/embedding.py
def compute_embedding(
matrix: np.ndarray,
method: str = "umap",
n_components: int = 2,
random_state: int = 42,
**kwargs: Any,
) -> np.ndarray:
"""Reduce an entity-presence matrix to a low-dimensional embedding.
Args:
matrix: 2-D array of shape (n_papers, n_entities).
method: ``"umap"`` or ``"tsne"``.
n_components: Target dimensionality (default 2).
random_state: Seed for reproducibility.
**kwargs: Forwarded to the underlying reducer constructor.
Returns:
ndarray of shape (n_papers, n_components).
Raises:
ValueError: If matrix has fewer than 2 rows.
ImportError: If method is ``"umap"`` and umap-learn is not installed.
ValueError: If method is not ``"umap"`` or ``"tsne"``.
"""
if matrix.shape[0] < 2:
raise ValueError(
f"compute_embedding requires at least 2 rows, got {matrix.shape[0]}"
)
if method == "umap":
try:
impoembed_papers function · python · L136-L191 (56 LOC)src/papersift/embedding.py
def embed_papers(
papers: List[Dict[str, Any]],
method: str = "umap",
use_topics: bool = False,
random_state: int = 42,
**kwargs: Any,
) -> Dict[str, Tuple[float, float]]:
"""High-level: papers in, {doi: (x, y)} out.
Internally chains extract_paper_entities -> build_entity_matrix ->
compute_embedding, then maps coordinates back to DOIs.
Papers whose entity set is empty are placed at the centroid of the
embedding with small random jitter so they remain visible but do not
distort the layout.
Args:
papers: List of paper dicts with 'doi' and 'title' fields.
method: ``"umap"`` or ``"tsne"``.
use_topics: If True, also use OpenAlex topics as entities.
random_state: Seed for reproducibility.
**kwargs: Forwarded to compute_embedding.
Returns:
Mapping of DOI to (x, y) coordinate tuple.
"""
paper_entities = extract_paper_entities(papers, use_topics=use_topics)
matrix, doi_list, entisub_cluster function · python · L194-L261 (68 LOC)src/papersift/embedding.py
def sub_cluster(
papers: List[Dict[str, Any]],
cluster_id: Union[int, str],
clusters: Dict[str, Union[int, str]],
resolution: float = 1.0,
seed: Optional[int] = None,
use_topics: bool = False,
) -> Dict[str, str]:
"""Hierarchical sub-clustering within an existing cluster.
Filters papers to those belonging to *cluster_id*, builds a new entity
graph from the subset, runs Leiden, and returns membership with
hierarchical IDs of the form ``"{cluster_id}.{sub_id}"``.
If only one paper belongs to the cluster, or Leiden finds only a single
sub-cluster, the original cluster_id is returned unchanged (as a string).
Args:
papers: Full list of paper dicts.
cluster_id: The cluster to sub-divide.
clusters: Existing DOI -> cluster_id mapping.
resolution: Leiden resolution for the sub-clustering.
seed: Random seed for Leiden.
use_topics: If True, also use OpenAlex topics as entities.
Returns:
OpenAlexEnricher.__init__ method · python · L28-L34 (7 LOC)src/papersift/enrich.py
def __init__(self, email: str):
    """Register the contact email with pyalex and remember it.

    Args:
        email: Contact email for the OpenAlex polite pool (faster rate limits).
    """
    # Chained targets are assigned left to right: pyalex's config first,
    # then the instance attribute.
    pyalex.config.email = self.email = email

# OpenAlexEnricher.enrich_papers method · python · L36-L106 (71 LOC)src/papersift/enrich.py
def enrich_papers(
self,
papers: List[Dict[str, Any]],
fields: Optional[List[str]] = None,
progress: bool = True,
) -> List[Dict[str, Any]]:
"""
Enrich papers with OpenAlex data.
Args:
papers: List of paper dicts with 'doi' key.
fields: Fields to fetch. Default: ['referenced_works', 'openalex_id'].
Supported: 'referenced_works', 'openalex_id', 'topics', 'abstract'.
progress: Show progress output.
Returns:
Papers list with requested fields added.
"""
if fields is None:
fields = ['referenced_works', 'openalex_id']
papers_with_doi = [(i, p) for i, p in enumerate(papers) if p.get('doi')]
total = len(papers_with_doi)
if progress:
print(f"Enriching {total} papers (of {len(papers)} total)...")
enriched_count = 0
for idx, (i, paper) in enumerate(papers_with_doi):
OpenAlexEnricher._fetch_work method · python · L108-L120 (13 LOC)src/papersift/enrich.py
def _fetch_work(self, doi: str) -> Optional[Dict]:
"""Fetch a single work from OpenAlex by DOI."""
try:
# Normalize DOI to URL format for OpenAlex
if not doi.startswith('http'):
doi_url = f"https://doi.org/{doi}"
else:
doi_url = doi
work = Works()[doi_url]
return work
except Exception:
return NoneOpenAlexEnricher._resolve_openalex_ids_to_dois method · python · L122-L157 (36 LOC)src/papersift/enrich.py
def _resolve_openalex_ids_to_dois(self, openalex_ids: List[str]) -> List[str]:
"""
Batch-resolve OpenAlex work IDs to DOIs.
Args:
openalex_ids: List of OpenAlex URLs like 'https://openalex.org/W1234567'.
Returns:
List of DOIs (strings). IDs that can't be resolved are omitted.
"""
dois = []
# Process in batches
for batch_start in range(0, len(openalex_ids), self.BATCH_SIZE):
batch = openalex_ids[batch_start:batch_start + self.BATCH_SIZE]
# Build pipe-separated filter
id_filter = "|".join(batch)
try:
results = Works().filter(openalex_id=id_filter).get()
for work in results:
doi = work.get('doi')
if doi:
# OpenAlex returns DOIs as URLs, strip prefix
if doi.startswith('https://doi.org/'):
doi = doi[lenMethodology: Repobility · https://repobility.com/research/state-of-ai-code-2026/
OpenAlexEnricher._reconstruct_abstract method · python · L160-L173 (14 LOC)src/papersift/enrich.py
def _reconstruct_abstract(work: Dict) -> Optional[str]:
"""Reconstruct abstract from OpenAlex inverted index format."""
inv_index = work.get('abstract_inverted_index')
if not inv_index:
return None
# Inverted index: {word: [positions]}
word_positions = []
for word, positions in inv_index.items():
for pos in positions:
word_positions.append((pos, word))
word_positions.sort(key=lambda x: x[0])
return " ".join(word for _, word in word_positions)ImprovedEntityExtractor.__init__ method · python · L60-L134 (75 LOC)src/papersift/entity_layer.py
def __init__(self):
# Methods - use word boundaries
self.methods = [
'scGPT', 'transformer', 'transformers', 'LSTM', 'CNN', 'RNN', 'GRU',
'neural network', 'deep learning', 'machine learning', 'ML', 'DL', 'AI',
'random forest', 'support vector', 'SVM', 'clustering', 'k-means',
'classification', 'regression', 'ensemble', 'boosting', 'XGBoost',
'reinforcement learning', 'RL',
'GAN', 'VAE', 'autoencoder', 'diffusion model',
'attention mechanism', 'self-attention', 'BERT', 'GPT', 'LLM',
'foundation model', 'language model', 'embedding', 'representation learning',
'transfer learning', 'fine-tuning', 'pre-training', 'pretraining',
'zero-shot', 'few-shot', 'contrastive learning',
'graph neural network', 'GNN', 'graph convolutional', 'GCN',
'message passing', 'node embedding',
'simulation', 'optimization', 'algorithm',
ImprovedEntityExtractor._compile_patterns method · python · L136-L161 (26 LOC)src/papersift/entity_layer.py
def _compile_patterns(self):
"""Pre-compile regex patterns for efficient matching."""
self.method_patterns = []
for method in self.methods:
# Escape special chars and add word boundaries
pattern = re.compile(r'\b' + re.escape(method.lower()) + r'\b', re.IGNORECASE)
self.method_patterns.append((method, pattern))
self.organism_patterns = []
for organism in self.organisms:
pattern = re.compile(r'\b' + re.escape(organism.lower()) + r'\b', re.IGNORECASE)
self.organism_patterns.append((organism, pattern))
# Add 'rat' separately with strict word boundary (not in 'generative', etc.)
# \brat\b will only match standalone "rat"
self.organism_patterns.append(('rat', re.compile(r'\brat\b', re.IGNORECASE)))
self.concept_patterns = []
for concept in self.concepts:
pattern = re.compile(r'\b' + re.escape(concept.lower()) + r'\b', re.IGNORECASE)
ImprovedEntityExtractor.extract_entities method · python · L163-L225 (63 LOC)src/papersift/entity_layer.py
def extract_entities(self, title: str, category: str) -> List[Dict[str, str]]:
"""
Extract entities from title using word-boundary regex matching.
Args:
title: Paper title to extract entities from
category: Paper category (currently unused, for future expansion)
Returns:
List of {"name": str, "type": str} dicts
"""
entities = []
seen = set()
# Extract methods
for method, pattern in self.method_patterns:
if pattern.search(title):
key = method.lower()
if key not in seen:
entities.append({"name": method, "type": "METHOD"})
seen.add(key)
# Extract organisms
for organism, pattern in self.organism_patterns:
if pattern.search(title):
key = organism.lower()
if key not in seen:
entities.append({"name": organism, "type":EntityLayerBuilder.__init__ method · python · L236-L248 (13 LOC)src/papersift/entity_layer.py
def __init__(self, use_topics: bool = False):
    """Set up an empty builder; no graph exists until papers are processed.

    Args:
        use_topics: If True, also use OpenAlex topics from paper['topics']
            as entities. Requires enriched paper data with a 'topics' field.
    """
    self.extractor = ImprovedEntityExtractor()
    self.use_topics = use_topics
    # Per-paper state, empty until papers are processed.
    self._dois: List[str] = []
    self._paper_entities: Dict[str, set] = {}  # doi -> set(entity_names)
    self.graph: Optional[ig.Graph] = None

# EntityLayerBuilder._extract_entities_for_paper method · python · L250-L279 (30 LOC)src/papersift/entity_layer.py
def _extract_entities_for_paper(self, paper: Dict[str, Any]) -> Set[str]:
"""
Extract entities from a paper, optionally including topics.
Args:
paper: Paper dict with 'title', optional 'category', optional 'topics'
Returns:
Set of lowercase entity names
"""
# Rule-based entities from title
entities = self.extractor.extract_entities(
paper['title'],
paper.get('category', '')
)
entity_set = {e['name'].lower() for e in entities}
# Add OpenAlex topics if enabled
if self.use_topics:
for topic in paper.get('topics', []):
# Add topic display name
display_name = topic.get('display_name', '')
if display_name:
entity_set.add(display_name.lower())
# Add subfield for broader coverage
subfield = topic.get('subfield', {}).get('display_name', '')
EntityLayerBuilder.build_from_papers method · python · L281-L326 (46 LOC)src/papersift/entity_layer.py
def build_from_papers(self, papers: List[Dict[str, Any]]) -> ig.Graph:
"""
Build paper-paper graph via shared entities.
Algorithm:
1. Extract entities from each paper (title + optional topics)
2. For each paper pair: edge weight = |shared entities|
3. Create igraph with DOIs as node attributes
Args:
papers: List of paper dicts with 'doi' and 'title' fields
Returns:
igraph.Graph with DOI vertex attributes and weight edge attributes
"""
# Step 1: Extract entities
self._dois = []
self._paper_entities = {}
for paper in papers:
doi = paper['doi']
self._dois.append(doi)
self._paper_entities[doi] = self._extract_entities_for_paper(paper)
# Step 2: Compute edges
n = len(self._dois)
edges = []
weights = []
for i in range(n):
doi1 = self._dois[i]
ents1 = self._pEntityLayerBuilder.run_leiden method · python · L328-L357 (30 LOC)src/papersift/entity_layer.py
def run_leiden(
    self,
    resolution: float = 1.0,
    seed: Optional[int] = None
) -> Dict[str, int]:
    """Partition the paper graph with Leiden and map each DOI to its cluster.

    Args:
        resolution: Passed as ``resolution_parameter``; higher = more clusters.
        seed: Random seed forwarded to leidenalg for reproducibility.

    Returns:
        {doi: cluster_id}

    Raises:
        ValueError: If no graph has been built yet.
    """
    graph = self.graph
    if graph is None:
        raise ValueError("Call build_from_papers() first")

    partition = leidenalg.find_partition(
        graph,
        leidenalg.RBConfigurationVertexPartition,
        resolution_parameter=resolution,
        weights='weight',
        seed=seed,
    )

    # membership[i] is the cluster of vertex i; pair it with that vertex's DOI.
    assignment = {}
    for vertex, cluster in zip(graph.vs, partition.membership):
        assignment[vertex['doi']] = cluster
    return assignment

# Generated by Repobility's multi-pass static-analysis pipeline (https://repobility.com)
EntityLayerBuilder.get_cluster_summary method · python · L359-L402 (44 LOC)src/papersift/entity_layer.py
def get_cluster_summary(self, clusters: Dict[str, int]) -> List[Dict]:
"""
Generate summary for each cluster.
Args:
clusters: Mapping of DOI to cluster_id from run_leiden()
Returns:
List of cluster summaries, sorted by size (largest first):
{
"cluster_id": int,
"size": int,
"dois": List[str],
"top_entities": List[str] # Most common entities (top 10)
}
"""
# Group by cluster
cluster_members = defaultdict(list)
for doi, cid in clusters.items():
cluster_members[cid].append(doi)
summaries = []
for cid, dois in sorted(cluster_members.items()):
# Count entities in this cluster
entity_counts = defaultdict(int)
for doi in dois:
for ent in self._paper_entities.get(doi, []):
entity_counts[ent] += 1
top_eEntityLayerBuilder.find_hub_papers method · python · L411-L447 (37 LOC)src/papersift/entity_layer.py
def find_hub_papers(self, top_k: int = 10) -> List[Dict]:
"""
Find Entity Hub Papers (Citation Major Paper alternative).
Hub Score = weighted degree = sum of shared entities with all neighbors
High hub score = paper shares many entities with many other papers
Args:
top_k: Number of top hub papers to return
Returns:
List of hub paper dicts (sorted by hub_score descending):
{
"doi": str,
"hub_score": int,
"entities": List[str] (top 10 entities)
}
"""
if self.graph is None:
raise ValueError("Call build_from_papers() first")
# Weighted degree = sum of edge weights
scores = self.graph.strength(weights='weight')
ranked = sorted(
zip(self._dois, scores),
key=lambda x: -x[1]
)[:top_k]
return [
{
'doi': doi,
'EntityLayerBuilder.find_papers_by_entity method · python · L449-L463 (15 LOC)src/papersift/entity_layer.py
def find_papers_by_entity(self, entity_name: str) -> List[str]:
    """List the DOIs of papers tagged with a given entity.

    Args:
        entity_name: Entity to look for; lowercased before comparison.

    Returns:
        DOIs whose entity set contains the lowercased name, in the
        iteration order of the builder's paper-entity mapping.
    """
    needle = entity_name.lower()
    matches = []
    for doi, names in self._paper_entities.items():
        if needle in names:
            matches.append(doi)
    return matches

# EntityLayerBuilder.expand_from_seed method · python · L465-L501 (37 LOC)src/papersift/entity_layer.py
def expand_from_seed(self, seed_doi: str, hops: int = 1) -> Set[str]:
"""
Expand from seed paper via shared entities (Citation stream alternative).
Like citation forward/backward traversal, but using entity connections.
Args:
seed_doi: Starting paper DOI
hops: Number of expansion hops (1 = direct neighbors only)
Returns:
Set of DOIs reachable within hops (including seed)
"""
if self.graph is None:
raise ValueError("Call build_from_papers() first")
if seed_doi not in self._dois:
raise ValueError(f"Seed DOI not in graph: {seed_doi}")
visited = {seed_doi}
current = {seed_doi}
for _ in range(hops):
next_layer = set()
for doi in current:
idx = self._dois.index(doi)
neighbors = self.graph.neighbors(idx)
for n_idx in neighbors:
n_doi = self.graph.EntityLayerBuilder.entity_stream method · python · L503-L564 (62 LOC)src/papersift/entity_layer.py
def entity_stream(
self,
start_doi: str,
strategy: str = 'strongest',
max_hops: int = 10
) -> List[str]:
"""
Follow entity connections like citation stream traversal.
Strategies:
- 'strongest': Follow edge with highest weight (most shared entities)
- 'diverse': Follow edge introducing most new entities
Args:
start_doi: Starting paper DOI
strategy: 'strongest' or 'diverse'
max_hops: Maximum path length
Returns:
Ordered path of DOIs from start
"""
if self.graph is None:
raise ValueError("Call build_from_papers() first")
path = [start_doi]
visited = {start_doi}
current = start_doi
for _ in range(max_hops):
idx = self._dois.index(current)
neighbors = self.graph.neighbors(idx)
if not neighbors:
break
candidates = []
create_app function · python · L23-L146 (124 LOC)src/papersift/ui/app.py
def create_app(papers_path: str, use_topics: bool = False) -> Dash:
"""
Create and configure the Dash application.
Args:
papers_path: Path to papers JSON file
use_topics: If True, use OpenAlex topics for enhanced clustering
Returns:
Configured Dash application
"""
# Load and process data
papers = load_papers(papers_path)
# Detect topics presence and override use_topics if data has topics
has_topics = any('topics' in p and p['topics'] for p in papers)
if has_topics and not use_topics:
use_topics = has_topics
clusters, builder = cluster_papers(papers, resolution=1.0, use_topics=use_topics)
elements = papers_to_cytoscape_elements(papers, clusters, builder)
rows = papers_to_table_data(papers, clusters)
colors = generate_cluster_colors(set(clusters.values()))
# Compute embedding for landscape (standalone, no builder needed)
embedding = compute_paper_embedding(papers, method="tsne", use_topicrun_server function · python · L149-L169 (21 LOC)src/papersift/ui/app.py
def run_server(
    papers_path: str,
    port: int = 8050,
    debug: bool = False,
    host: str = "127.0.0.1",
    use_topics: bool = False,
):
    """Build the Dash app for *papers_path* and serve it.

    Args:
        papers_path: Path to papers JSON file
        port: Server port (default 8050)
        debug: Enable debug mode
        host: Server host (default 127.0.0.1, use 0.0.0.0 for external access)
        use_topics: If True, use OpenAlex topics for enhanced clustering
    """
    app = create_app(papers_path, use_topics=use_topics)

    # Binding to all interfaces gets an explicit hint in the startup banner.
    if host == "0.0.0.0":
        url = f"http://0.0.0.0:{port} (accessible externally)"
    else:
        url = f"http://{host}:{port}"
    print(f"Starting PaperSift UI at {url}")

    app.run(debug=debug, port=port, host=host)

# _push_checkpoint function · python · L7-L19 (13 LOC)src/papersift/ui/callbacks/clustering.py
def _push_checkpoint(history, checkpoint):
"""Push a checkpoint onto the history stack."""
h = dict(history)
checkpoints = list(h.get('checkpoints', []))
max_size = h.get('max_size', 20)
checkpoints.append(checkpoint)
if len(checkpoints) > max_size:
checkpoints = checkpoints[-max_size:]
h['checkpoints'] = checkpoints
h['current_index'] = len(checkpoints) - 1
return hWant fix-PRs on findings? Install Repobility's GitHub App · github.com/apps/repobility-bot
register_history_callbacks function · python · L6-L81 (76 LOC)src/papersift/ui/callbacks/history.py
def register_history_callbacks(app):
"""Register undo and history display callbacks."""
# Undo button
@app.callback(
Output('papers-data', 'data', allow_duplicate=True),
Output('cluster-data', 'data', allow_duplicate=True),
Output('cytoscape-network', 'elements', allow_duplicate=True),
Output('paper-table', 'rowData', allow_duplicate=True),
Output('cluster-colors', 'data', allow_duplicate=True),
Output('selection-store', 'data', allow_duplicate=True),
Output('embedding-data', 'data', allow_duplicate=True),
Output('navigation-state', 'data', allow_duplicate=True),
Output('history-stack', 'data', allow_duplicate=True),
Output('breadcrumb-container', 'children', allow_duplicate=True),
Input('undo-btn', 'n_clicks'),
State('history-stack', 'data'),
State('original-papers', 'data'),
State('resolution-slider', 'value'),
State('use-topics-flag', 'data'),
_push_checkpoint function · python · L215-L227 (13 LOC)src/papersift/ui/callbacks/navigation.py
def _push_checkpoint(history, checkpoint):
"""Push a checkpoint onto the history stack."""
h = dict(history)
checkpoints = list(h.get('checkpoints', []))
max_size = h.get('max_size', 20)
checkpoints.append(checkpoint)
if len(checkpoints) > max_size:
checkpoints = checkpoints[-max_size:]
h['checkpoints'] = checkpoints
h['current_index'] = len(checkpoints) - 1
return hregister_selection_callbacks function · python · L6-L80 (75 LOC)src/papersift/ui/callbacks/selection.py
def register_selection_callbacks(app):
"""Register all selection-related callbacks."""
# Network selection -> Store
@app.callback(
Output('selection-store', 'data', allow_duplicate=True),
Input('cytoscape-network', 'selectedNodeData'),
prevent_initial_call=True
)
def network_to_store(selected_nodes):
if selected_nodes is None:
return {'selected_dois': [], 'source': 'network'}
dois = [node['id'] for node in selected_nodes]
return {'selected_dois': dois, 'source': 'network'}
# Table selection -> Store
@app.callback(
Output('selection-store', 'data', allow_duplicate=True),
Input('paper-table', 'selectedRows'),
prevent_initial_call=True
)
def table_to_store(selected_rows):
if selected_rows is None:
return {'selected_dois': [], 'source': 'table'}
dois = [row['doi'] for row in selected_rows]
return {'selected_dois': dois, 'source': 'tabcreate_breadcrumb function · python · L6-L38 (33 LOC)src/papersift/ui/components/breadcrumb.py
def create_breadcrumb(path: list) -> html.Div:
"""
Render breadcrumb navigation showing the drill-down hierarchy.
Args:
path: list of cluster IDs forming the drill-down path
[] -> "All Papers"
[3] -> "All Papers > Cluster 3"
[3, 1] -> "All Papers > Cluster 3 > Sub 3.1"
Returns:
html.Div with breadcrumb trail
"""
items = [html.Span('All Papers', style={'fontWeight': 'bold'})]
for i, cid in enumerate(path):
items.append(html.Span(' > ', style={'color': '#999', 'margin': '0 4px'}))
label = f'Cluster {cid}' if i == 0 else f'Sub {".".join(str(x) for x in path[:i+1])}'
if i == len(path) - 1:
# Current level (not clickable)
items.append(html.Span(label, style={'fontWeight': 'bold', 'color': '#6f42c1'}))
else:
items.append(html.Span(label, style={'color': '#007bff'}))
return html.Div(
children=items,
style={
create_landscape_figure function · python · L7-L63 (57 LOC)src/papersift/ui/components/landscape.py
def create_landscape_figure(
embedding_data: Dict[str, list],
clusters: Dict[str, Any],
colors: Dict[Any, str],
papers: List[Dict[str, Any]],
):
"""Create Plotly scatter figure for paper landscape.
Args:
embedding_data: {doi: [x, y]} coordinates
clusters: {doi: cluster_id}
colors: {cluster_id: hex_color}
papers: paper list for hover info
Returns:
plotly Figure
"""
import plotly.graph_objects as go
doi_to_paper = {p['doi']: p for p in papers}
# Group DOIs by cluster
cluster_dois = {}
for doi, cid in clusters.items():
cluster_dois.setdefault(cid, []).append(doi)
fig = go.Figure()
for cid in sorted(cluster_dois.keys(), key=str):
dois = cluster_dois[cid]
xs = [embedding_data[d][0] for d in dois if d in embedding_data]
ys = [embedding_data[d][1] for d in dois if d in embedding_data]
titles = [doi_to_paper.get(d, {}).get('title', d)[:60] for d in docreate_landscape_component function · python · L66-L95 (30 LOC)src/papersift/ui/components/landscape.py
def create_landscape_component(
    embedding_data: Dict[str, list],
    clusters: Dict[str, Any],
    colors: Dict[Any, str],
    papers: List[Dict[str, Any]],
) -> html.Div:
    """Wrap the paper-landscape scatter plot in a Dash container.

    The figure itself is produced by ``create_landscape_figure``; this
    function only places it inside a ``dcc.Graph`` and an outer ``html.Div``.

    Args:
        embedding_data: {doi: [x, y]} coordinates
        clusters: {doi: cluster_id}
        colors: {cluster_id: hex_color}
        papers: paper list for hover info

    Returns:
        html.Div with id ``'landscape-container'`` holding a single
        dcc.Graph with id ``'landscape-scatter'``
    """
    scatter = dcc.Graph(
        id='landscape-scatter',
        figure=create_landscape_figure(embedding_data, clusters, colors, papers),
        config={'displayModeBar': True, 'scrollZoom': True},
        style={'height': '600px'},
    )
    return html.Div(id='landscape-container', children=[scatter])
# create_network_component function · python · L7-L52 (46 LOC)src/papersift/ui/components/network.py
def create_network_component(elements: list, stylesheet: list = None) -> html.Div:
"""
Create Cytoscape network component with box selection enabled.
Args:
elements: List of node/edge elements
stylesheet: Optional custom stylesheet
Returns:
Dash Div containing the Cytoscape component
"""
if stylesheet is None:
stylesheet = get_default_stylesheet()
return html.Div([
cyto.Cytoscape(
id='cytoscape-network',
elements=elements,
stylesheet=stylesheet,
layout={
'name': 'cose',
'animate': False,
'nodeRepulsion': 8000,
'idealEdgeLength': 100,
'edgeElasticity': 100,
'nestingFactor': 0.1,
'gravity': 0.25,
'numIter': 500,
'initialTemp': 200,
'coolingFactor': 0.95,
'minTemp': 1.0,
},
sget_default_stylesheet function · python · L55-L98 (44 LOC)src/papersift/ui/components/network.py
def get_default_stylesheet() -> list:
"""
Get default stylesheet for Cytoscape.
Nodes are colored by cluster, edges are gray.
Selected nodes have a thick border.
"""
return [
# Base node style
{
'selector': 'node',
'style': {
'label': 'data(label)',
'background-color': 'data(color)',
'width': 20,
'height': 20,
'font-size': '8px',
'text-valign': 'bottom',
'text-halign': 'center',
'text-wrap': 'ellipsis',
'text-max-width': '80px',
}
},
# Selected node style
{
'selector': 'node:selected',
'style': {
'border-width': 3,
'border-color': '#000',
'width': 30,
'height': 30,
}
},
# Base edge style
{
'selector': 'edge',
Source: Repobility analyzer · https://repobility.com
_escape_doi_for_selector function · python · L101-L107 (7 LOC)src/papersift/ui/components/network.py
# Translation table mapping each selector-special character to its
# backslash-escaped form; built once so every call is a single pass.
_SELECTOR_ESCAPES = str.maketrans(
    {char: '\\' + char for char in ('\\', '"', "'", '[', ']', '/', '.', ':')}
)


def _escape_doi_for_selector(doi: str) -> str:
    """Escape DOI for use in CSS selector.

    Each of ``\\ " ' [ ] / . :`` is prefixed with a backslash. The mapping is
    applied per input character, so backslashes inserted by the escaping
    are never escaped a second time.

    Args:
        doi: DOI string to escape.

    Returns:
        The DOI with every special character backslash-escaped.
    """
    return doi.translate(_SELECTOR_ESCAPES)
# get_highlight_stylesheet function · python · L110-L139 (30 LOC)src/papersift/ui/components/network.py
def get_highlight_stylesheet(
    base_stylesheet: list,
    selected_dois: list,
) -> list:
    """
    Generate stylesheet with selected nodes highlighted.

    The base stylesheet is copied, never modified in place. One rule per
    selected DOI is appended after the base rules, each with its own style
    dict so later edits to one rule cannot leak into another.

    Args:
        base_stylesheet: Default stylesheet; ``None`` is treated as empty
        selected_dois: List of selected DOIs; ``None`` or empty means no
            highlight rules are added

    Returns:
        Updated stylesheet (a new list)
    """
    stylesheet = list(base_stylesheet or [])
    # A cleared selection can arrive as None rather than []; treat both alike.
    stylesheet.extend(
        {
            'selector': f'node[id = "{_escape_doi_for_selector(doi)}"]',
            'style': {
                'border-width': 3,
                'border-color': '#ff0000',
                'width': 30,
                'height': 30,
            },
        }
        for doi in (selected_dois or ())
    )
    return stylesheet
# create_sidebar function · python · L6-L176 (171 LOC)src/papersift/ui/components/sidebar.py
def create_sidebar() -> html.Div:
"""
Create sidebar with controls for filtering, navigation, and re-clustering.
Contains:
- Resolution slider
- Selection actions (Keep/Exclude/Reset)
- Navigation (Drill-down/Back)
- Undo
- Export
- Statistics
"""
return html.Div([
html.H3('Controls', style={'marginBottom': '20px'}),
# Resolution slider
html.Div([
html.Label('Cluster Resolution'),
dcc.Slider(
id='resolution-slider',
min=0.1,
max=3.0,
step=0.1,
value=1.0,
marks={0.5: '0.5', 1.0: '1.0', 2.0: '2.0', 3.0: '3.0'},
tooltip={'placement': 'bottom', 'always_visible': True},
updatemode='mouseup',
),
html.Small('Higher = more clusters', style={'color': '#666'})
], style={'marginBottom': '30px'}),
# Selection actions
html.Div([
page 1 / 2next ›