← back to e-esteva__phenosuite

Function bodies 54 total

All specs Real LLM only Function bodies
_compute_bounds function · python · L24-L28 (5 LOC)
masquerade/production/masquerade.py
def _compute_bounds(spatial_metadata):
    """Compute the integer bounding box of the ``"x"``/``"y"`` coordinates.

    Both coordinate entries of *spatial_metadata* are cast with
    ``.astype(int)`` before their extrema are taken.

    Returns
    -------
    tuple of int
        ``(x_min, x_max, y_min, y_max)``.
    """
    extents = []
    for axis in ("x", "y"):
        coords = spatial_metadata[axis].astype(int)
        extents.append(int(coords.min()))
        extents.append(int(coords.max()))
    return tuple(extents)
_apply_crop function · python · L31-L36 (6 LOC)
masquerade/production/masquerade.py
def _apply_crop(arr_2d, bounds, adjust):
    """Return *arr_2d* sliced to *bounds*, or untouched when *adjust* is falsy.

    *bounds* is unpacked as ``(x_min, x_max, y_min, y_max)``; the first axis
    is sliced ``y_min:y_max`` and the second ``x_min:x_max`` (end-exclusive).
    """
    if adjust:
        x_lo, x_hi, y_lo, y_hi = bounds
        return arr_2d[y_lo:y_hi, x_lo:x_hi]
    return arr_2d
PreProcessImage function · python · L41-L58 (18 LOC)
masquerade/production/masquerade.py
def PreProcessImage(image_source, spatial_metadata, adjust_coords=True):
    """Load the TIFF, optionally crop to coordinate bounds.

    Parameters
    ----------
    image_source : path-like
        Converted with ``str()`` and read via ``tifffile.imread``.
    spatial_metadata : object with ``"x"`` and ``"y"`` entries
        Forwarded to ``_compute_bounds`` to derive the bounding box.
    adjust_coords : bool
        When true, the last two axes are sliced to ``y_min:y_max`` and
        ``x_min:x_max`` (end-exclusive).

    Returns
    -------
    image : ndarray  (C, H, W)
    raw_img_size : float  – approximate size in GB (uint8)
    bounds : tuple  – (x_min, x_max, y_min, y_max)
    """
    image = tifffile.imread(str(image_source))
    bounds = _compute_bounds(spatial_metadata)

    if adjust_coords:
        x_min, x_max, y_min, y_max = bounds
        image = image[:, y_min:y_max, x_min:x_max]

    # At uint8 every element occupies one byte, so the element count equals
    # the byte count. This avoids materialising a full uint8 copy of the
    # image just to read its ``nbytes``.
    raw_img_size = image.size / 1e9
    return image, raw_img_size, bounds
get_mask_channels function · python · L61-L117 (57 LOC)
masquerade/production/masquerade.py
def get_mask_channels(
    image, spatial_metadata, raw_img_size, bounds, adjust_coords=True
):
    """Build per-cluster binary masks, dilate, compress.

    Returns
    -------
    channels : dict[str, ndarray]
    compression_factor : float
    """
    x_min, x_max, y_min, y_max = bounds
    cluster_ids = spatial_metadata["cluster"].unique()

    # Pre-compute the summed image once (across channels → 2-D)
    summed_image = image.sum(axis=0)  # (H, W)

    channels = {}
    compression_factor = 1.0

    for idx, cid in enumerate(cluster_ids):
        cluster = spatial_metadata[spatial_metadata["cluster"] == cid]
        if cluster.shape[0] <= 1:
            continue

        # Cluster pixel coordinates (optionally shifted to crop space)
        cx = cluster["x"].values.astype(int)
        cy = cluster["y"].values.astype(int)
        if adjust_coords:
            cx = cx - x_min
            cy = cy - y_min

        # Build boolean mask and dilate by 1 px (replaces manual ±1 expansion)
compress_marker_channels function · python · L120-L174 (55 LOC)
masquerade/production/masquerade.py
def compress_marker_channels(
    image_source,
    channels,
    compression_factor,
    spatial_metadata,
    bounds,
    relevant_markers=None,
    adjust_coords=True,
):
    """Read individual TIFF pages (lazy), crop, compress, and add to *channels*.

    Reads one page at a time so memory never exceeds ~1 page + output.
    """
    x_min, x_max, y_min, y_max = bounds

    # Build the marker whitelist set (if provided)
    marker_set = None
    if relevant_markers is not None:
        raw = list(relevant_markers["x"])
        variants = set(raw)
        for m in raw:
            variants.add(m.replace("_", ""))
            variants.add(m.replace("_", "-"))
        marker_set = variants

    with TiffFile(str(image_source)) as tif:
        for page in tif.series[0].pages:
            desc = page.description
            if not desc:
                continue
            el = ElementTree.fromstring(desc).find("Biomarker")
            if el is None or el.text is None:
                co
writeMaskTiff function · python · L177-L190 (14 LOC)
masquerade/production/masquerade.py
def writeMaskTiff(channels, outPath):
    """Write every entry of *channels* as one plane of an ImageJ-flagged TIFF.

    The arrays are stacked along a new leading axis in the mapping's key
    order and cast to ``uint8``; the keys are stored under the ``"Labels"``
    metadata entry.
    """
    channel_names = list(channels.keys())
    planes = [channels[name] for name in channel_names]
    volume = np.stack(planes, axis=0).astype("uint8")

    tifffile.imwrite(
        str(outPath),
        volume,
        imagej=True,
        metadata={"Labels": channel_names},
    )

    # Drop the stacked copy right away and prompt a collection pass.
    del volume
    gc.collect()
openApp function · javascript · L1-L20 (20 LOC)
site/assets/js/phenomenalist-site.js
function openApp(evt, appName) {
  // Hide every element carrying the "tabcontent" class.
  var panels = document.getElementsByClassName("tabcontent");
  Array.prototype.forEach.call(panels, function (panel) {
    panel.style.display = "none";
  });

  // Strip the " active" marker from every element carrying the "tablinks" class.
  var buttons = document.getElementsByClassName("tablinks");
  Array.prototype.forEach.call(buttons, function (button) {
    button.className = button.className.replace(" active", "");
  });

  // Reveal the element whose id is appName and flag the element that
  // handled the event as active.
  document.getElementById(appName).style.display = "block";
  evt.currentTarget.className = evt.currentTarget.className + " active";
}
Source: Repobility analyzer · https://repobility.com
model_spatial_interactions function · python · L1-L94 (94 LOC)
spatial_interactions/utils/spatial-shiny/model-spatial-interactions.py
def model_spatial_interactions(spatial_obj,out_dir,label,resolution,p1,p2,min_count):
  import os 
  import numpy as np
  import pandas as pd
  import scipy
  from scipy import spatial, io, sparse
  import re
  from itertools import chain
  
  

  os.makedirs(out_dir,exist_ok=True)
  
  print('outdir')
  print(out_dir)
  min_count = min_count
  
  
  dist_mat = scipy.spatial.distance.pdist(spatial_obj[['x','y']])
  dist_square = scipy.spatial.distance.squareform(dist_mat)

  # pull all class idx:
  celltypes=[re.sub(pattern='[+]',repl='pos',string=x) for x in spatial_obj['celltype'] ]
  classes = list(set(celltypes))



  class_idx = [[x for x in range(len(celltypes)) if len(re.findall(pattern='^'+str(y)+'$', string = celltypes[x])) > 0] for y in classes]
  
  class_sizes = np.array([len(x) for x in class_idx])

  resolution=resolution
  # minimum distance:
  p1=p1
  # maximum distance
  p2=p2
  p2=p2+p1
  
  p1_scaled=p1/resolution
  p2_scaled=p2/resolution

  phyper=[]
  logOdds=[]
 
read_csv_tsv function · python · L29-L45 (17 LOC)
spatial_interactions/utils/spatial-shiny/vectra_lib_v3.py
def read_csv_tsv(filename):
    """Read a delimited table, trying tab-separated first and comma second.

    The file is parsed as TSV; when the result has no ``'Path'`` column it is
    re-read with the pandas default (comma) delimiter. No error is raised if
    ``'Path'`` is still missing after the second attempt.

    When a ``'Sample_Name'`` column is present (the "legacy" detection used
    here), the column names are rewritten: underscores become spaces, each
    ``Opal <non-space run>`` is wrapped in parentheses, ``Normalized Counts
    Total Weighting`` becomes ``(Normalized Counts, Total Weighting)`` and
    ``HLA DR`` becomes ``HLA-DR``.

    Args:
        filename: string
            path handed to ``pd.read_csv`` (read up to two times)

    Returns:
        pandas df
    """
    table = pd.read_csv(filename, delimiter='\t')  # try tsv
    if 'Path' not in table.columns:  # try comma-delim
        table = pd.read_csv(filename)

    # detect_legacy_vectra
    if 'Sample_Name' in table.columns:  # underscores bad
        def wrap_in_parens(match):
            return '(' + match.group(0) + ')'

        # ``regex`` is spelled out on every call: pandas >= 2.0 defaults to
        # literal matching, which rejects a callable replacement with a
        # ValueError. The pattern is a raw string so ``\S`` is not an
        # invalid string escape.
        table.columns = (table.columns.str.replace('_', ' ', regex=False)
                         .str.replace(r'Opal [\S]*', wrap_in_parens,
                                      regex=True)
                         .str.replace('Normalized Counts Total Weighting',
                                      '(Normalized Counts, Total Weighting)',
                                      regex=False)
                         .str.replace('HLA DR', 'HLA-DR', regex=False)
                         )
    return table
extract_data function · python · L48-L204 (157 LOC)
spatial_interactions/utils/spatial-shiny/vectra_lib_v3.py
def extract_data(directory, classification = None, verbose = True, 
                 drop_nan = True, drop_duplicates = False, debug = False):
    """
    Extracts cell information from cell_seg_data files and pairs it with 
    corresponding score files from score_data.

    Args:
        directory: string
            string of parent directory containing all files
        classification: function
            must take a row of the dataframe and output a string 
        verbose: bool
            output all quality-control checking
        drop_nan: bool
            remove rows with NaN phenotype
        drop_duplicates: bool
            whether to remove duplicates with the same file name. Most recently
            modified will be kept.
        debug: bool
            if True, only compile score info and return score_files

    Returns:
        output: list of dicts
            each corresponding to an image file
            
        unless debug is True, in which case:
        score
count_cells function · python · L206-L234 (29 LOC)
spatial_interactions/utils/spatial-shiny/vectra_lib_v3.py
def count_cells(output, grouping = 'Phenotype', density = True):
    """
    Counts number or density of cells for each image.

    Args:
        output: list of dicts
            from extract_data
        grouping: 'Phenotype' or 'Classification' or list of variables
            whether to use VECTRA Phenotype or self-generated classification
        density: bool
            whether to density-normalize each image

    Returns:
        pandas df, with columns as image names and rows as cell types
    """
    
    counts_table = pd.concat(
        [sample['Data'].groupby(grouping).size().rename(sample['Sample Name']) 
         for sample in output], axis=1, sort=True).fillna(0)

    errors = counts_table.columns[counts_table.sum()==0]
    if len(errors) > 0:
        counts_table.drop(errors, axis=1, inplace=True)
        print("Warning: samples were omitted due to missing values: ", errors)
    
    if density:
        counts_table = (counts_table/counts_table.sum())       
        
 
pcf function · python · L236-L394 (159 LOC)
spatial_interactions/utils/spatial-shiny/vectra_lib_v3.py
def pcf(output, cell_types = None, phenotype = 'Phenotype', 
        count_threshold = 1):
    """
    Calculates pair correlation function using spatstat from R.
    Extra dependencies: rpy2 (3.2.2+), R (3.6+), spatstat (1.62-2+)

    Args:
        output: list of dicts
            from extract_data
        cell_types: list
            Names of pertinent cell types, which will be prepended by 'All'. 
            Default is all recognized phenotypes
        phenotype: string, 'Phenotype' or 'Classification'
            whether to use VECTRA Phenotype or self-generated classification
        count_threshold: int
            threshold to not calculate a specific pcf

    Returns:
        pandas df, with each row a sample
    """

    #only needs rpy2 if calculating pcf
    import rpy2
    import rpy2.robjects as robjects
    from rpy2.robjects.packages import importr
    from rpy2.robjects import pandas2ri
    
    pandas2ri.activate()
    r = robjects.r
    spatstat = importr("spatstat.
pcf_subset function · python · L396-L427 (32 LOC)
spatial_interactions/utils/spatial-shiny/vectra_lib_v3.py
def pcf_subset(df, cell1, cell2, min_count=20, max_radius = 200):
    """
    Analysis of pcfs of two specified cell types

    Args:
        df: pandas dataframe
            pcf dataframe from vl.pcf
        cell1, cell2: str
            Names of cell types of interest (or 'All') 
        min_count: int or list
            exclude samples with fewer than this many cells of each type
        max_radius: int
            steps to sum pcf to. steps are approx. 0.25 microns

    Returns:
        pandas df, with each row a sample
    """
    cell_mask = (((df['Cell_one']==cell1) & (df['Cell_two']==cell2)) 
                 | ((df['Cell_one']==cell2) & (df['Cell_two']==cell1)))
    count_mask = ((df['min_count'] > min_count) 
                  if type(min_count) == int 
                  else ((df['count_one'] > min_count[0]) 
                        & (df['count_two'] > min_count[1])))
    pcf_table = df[cell_mask & count_mask].copy()
    pcf_table['normalization'] = pd.to_numeric(pcf_table
error_median function · python · L429-L444 (16 LOC)
spatial_interactions/utils/spatial-shiny/vectra_lib_v3.py
def error_median(x):
    """
    Calculate shaded plot regions by bootstrapping for error bars.

    The rows of ``np.vstack(x)`` are resampled with replacement 100 times
    (via ``np.random.choice``); the column-wise median of each resample is
    kept, and the 2.5th and 97.5th percentiles of those medians are returned.

    Example of use for plotting:
    pcf_median = data.groupby(variable)['PCF'].apply(
               lambda x: np.median(np.vstack(x), axis=0))
    pcf_error = data.groupby(variable)['PCF'].apply(error_median)

    Returns:
        list of two arrays: [lower percentile curve, upper percentile curve]
    """
    # The stacked matrix is identical for every resample, so build it once
    # instead of once per bootstrap iteration.
    stacked = np.vstack(x)
    n_rows = len(x)
    bootstrap = np.vstack([
        np.median(
            stacked[np.random.choice(n_rows, n_rows, replace=True), :],
            axis=0)
        for _ in range(100)])

    return [np.percentile(bootstrap, 2.5, axis=0),
            np.percentile(bootstrap, 97.5, axis=0)]
nearest_neighbor function · python · L446-L495 (50 LOC)
spatial_interactions/utils/spatial-shiny/vectra_lib_v3.py
def nearest_neighbor(output, phenotype = 'Phenotype', k = 1):
    """
    Args:
        output: list of dicts
            from extract_data
        phenotype: string, 'Phenotype' or 'Classification'
            whether to use VECTRA Phenotype or self-generated classification
        k: int
            number of neighbors to average
    Returns:
        pandas df, 
        Mean distance from each cell type (first index level) to the 
        closest k neighbors of a given cell type (second index level), for each 
        sample (column)
    """
    
    nn_output = []
      
    knn_mean_function = lambda x: (x.apply(pd.Series.nsmallest, axis=1, n=k)
                                    .mean(axis = 1))
    knn_group_function = lambda group: (group.groupby(group.columns, axis=1)
                                             .apply(knn_mean_function)
                                             .mean())
        
    for sample in output:
        selection = sample['Data'] 
        sample_n
Repobility's GitHub App fixes findings like these · https://github.com/apps/repobility-bot
run_all function · python · L497-L523 (27 LOC)
spatial_interactions/utils/spatial-shiny/vectra_lib_v3.py
def run_all(output, folder_location):
    """
    Write the combined cell table, the pair correlation table and the
    nearest-neighbor table as CSV files.

    Files written, in this order: ``xy.csv``, ``pcf.csv`` and
    ``nearest_neighbor.csv`` (each appended to *folder_location* with a
    ``/`` separator).

    Args:
        output: list of dicts
            from extract_data
        folder_location: string
            where to save files
    Returns:
        None
    """
    # Per-sample tables, each tagged with the sample it came from.
    tagged = [sample['Data'].assign(Sample = sample['Sample Name'])
              for sample in output]
    xy = pd.concat(tagged)
    xy = xy.rename(columns={'Sample':'Sample Name'})
    xy.to_csv(folder_location+"/xy.csv")

    pcf_df = pcf(output, phenotype = 'Classification', count_threshold=10)
    pcf_df.to_csv(folder_location+"/pcf.csv")

    # One stacked column per neighbor count, named after that count.
    neighbor_columns = [
        nearest_neighbor(output, phenotype = 'Classification',
                         k = n_neighbors).stack().rename(n_neighbors)
        for n_neighbors in [1,5]
    ]
    nn = pd.concat(neighbor_columns, axis=1)
    nn.index.set_names('Sample Name', level=2, inplace=True)
    nn.to_csv(folder_location+"/nearest_neighbor.csv")

    return None
interaction_subset function · python · L531-L534 (4 LOC)
spatial_interactions/utils/spatial-shiny/vectra_lib_v3.py
def interaction_subset(df, cell1, cell2, min_count=20):
    """Select the rows of *df* describing the (cell1, cell2) pair.

    A row is kept when its ``Cell_one``/``Cell_two`` values match the pair in
    either order and its ``min_count`` is strictly greater than *min_count*;
    rows whose ``PCF`` equals the string ``'NA'`` are then dropped.
    """
    forward = (df['Cell_one'] == cell1) & (df['Cell_two'] == cell2)
    backward = (df['Cell_one'] == cell2) & (df['Cell_two'] == cell1)
    enough_cells = df['min_count'] > min_count

    selected = df[(forward | backward) & enough_cells]
    return selected[selected['PCF'] != 'NA']
plot_difference function · python · L538-L553 (16 LOC)
spatial_interactions/utils/spatial-shiny/vectra_lib_v3.py
def plot_difference(data, ax = False):
    """Plot the median ``normPCF`` curve of *data* with a bootstrap band.

    Draws the column-wise median of the stacked ``data['normPCF']`` entries,
    shades between the two curves returned by ``error_median``, and adds a
    dashed reference line at 1. The title is built from the first
    ``Cell_one`` and ``Cell_two`` values.

    Args:
        data: pandas df
            must provide 'normPCF', 'Cell_one' and 'Cell_two' columns
        ax: matplotlib axes, or False/None
            axes to draw on; when False (default) or None a new 8x5 inch
            figure is created

    Note: the x values come from ``x_data``, which is not defined in this
    function and must exist at module scope when it is called.
    """
    pcf_dm = np.median(np.vstack(data['normPCF']), axis=0)
    pcf_dsem = error_median(data['normPCF'])

    # Identity checks instead of ``ax == False``: equality would call
    # ``__eq__`` on whatever object was passed in.
    if ax is False or ax is None:
        fig, ax = plt.subplots()
        fig.set_size_inches(8, 5)

    ax.plot(x_data, pcf_dm)
    ax.fill_between(x_data, pcf_dsem[0], pcf_dsem[1], alpha=.4, lw=0)

    ax.set_title(data['Cell_one'].iloc[0] + ' vs. ' + data['Cell_two'].iloc[0],
                 fontsize=18)
    ax.plot(x_data, np.ones(len(x_data)), 'k--')
    # Raw string: ``\m`` is not a valid escape in a normal string literal.
    ax.set_xlabel(r'Radius ($\mu$m)', fontsize=16)
    ax.set_ylabel('PCF', fontsize=16)
    ax.tick_params(axis='both', which='major', labelsize=14)
error_median function · python · L566-L568 (3 LOC)
spatial_interactions/utils/spatial-shiny/vectra_lib_v3.py
def error_median(x):
    """Bootstrap a percentile band around the column-wise median of *x*.

    The rows of ``np.vstack(x)`` are resampled with replacement 100 times
    (via ``np.random.choice``); the column-wise median of each resample is
    kept, and the 2.5th and 97.5th percentiles of those medians are returned
    as a two-element list ``[lower, upper]``.
    """
    # The stacked matrix is identical for every resample, so build it once
    # instead of once per bootstrap iteration.
    stacked = np.vstack(x)
    n_rows = len(x)
    bootstrap = np.vstack([
        np.median(
            stacked[np.random.choice(n_rows, n_rows, replace=True), :],
            axis=0)
        for _ in range(100)])
    return [np.percentile(bootstrap, 2.5, axis=0),
            np.percentile(bootstrap, 97.5, axis=0)]
plot_difference function · python · L570-L585 (16 LOC)
spatial_interactions/utils/spatial-shiny/vectra_lib_v3.py
def plot_difference(data, ax = False):
    """Plot the median ``normPCF`` curve of *data* with a bootstrap band.

    Draws the column-wise median of the stacked ``data['normPCF']`` entries,
    shades between the two curves returned by ``error_median``, and adds a
    dashed reference line at 1. The title is built from the first
    ``Cell_one`` and ``Cell_two`` values.

    Args:
        data: pandas df
            must provide 'normPCF', 'Cell_one' and 'Cell_two' columns
        ax: matplotlib axes, or False/None
            axes to draw on; when False (default) or None a new 8x5 inch
            figure is created

    Note: the x values come from ``x_data``, which is not defined in this
    function and must exist at module scope when it is called.
    """
    pcf_dm = np.median(np.vstack(data['normPCF']), axis=0)
    pcf_dsem = error_median(data['normPCF'])

    # Identity checks instead of ``ax == False``: equality would call
    # ``__eq__`` on whatever object was passed in.
    if ax is False or ax is None:
        fig, ax = plt.subplots()
        fig.set_size_inches(8, 5)

    ax.plot(x_data, pcf_dm)
    ax.fill_between(x_data, pcf_dsem[0], pcf_dsem[1], alpha=.4, lw=0)

    ax.set_title(data['Cell_one'].iloc[0] + ' vs. ' + data['Cell_two'].iloc[0],
                 fontsize=18)
    ax.plot(x_data, np.ones(len(x_data)), 'k--')
    # Raw string: ``\m`` is not a valid escape in a normal string literal.
    ax.set_xlabel(r'Radius ($\mu$m)', fontsize=16)
    ax.set_ylabel('PCF', fontsize=16)
    ax.tick_params(axis='both', which='major', labelsize=14)
interaction_subset function · python · L587-L590 (4 LOC)
spatial_interactions/utils/spatial-shiny/vectra_lib_v3.py
def interaction_subset(df, cell1, cell2, min_count=20):
    """Select the rows of *df* describing the (cell1, cell2) pair.

    A row is kept when its ``Cell_one``/``Cell_two`` values match the pair in
    either order and its ``min_count`` is strictly greater than *min_count*;
    rows whose ``PCF`` equals the string ``'NA'`` are then dropped.
    """
    forward = (df['Cell_one'] == cell1) & (df['Cell_two'] == cell2)
    backward = (df['Cell_one'] == cell2) & (df['Cell_two'] == cell1)
    enough_cells = df['min_count'] > min_count

    selected = df[(forward | backward) & enough_cells]
    return selected[selected['PCF'] != 'NA']
plot_pcf_curves function · python · L594-L681 (88 LOC)
spatial_interactions/utils/spatial-shiny/vectra_lib_v3.py
def plot_pcf_curves(pcf_df,cell_types_,resolution,label,out_path):
  CELL_TYPES_ALIAS =['All']
  CELL_TYPES_ALIAS.extend(cell_types_)
  
  CELL_TYPES = CELL_TYPES_ALIAS
  COLOR_LIST = sns.color_palette(None, len(CELL_TYPES))
  print(COLOR_LIST)
  
  x_data = np.arange(0, 125, .25) 
  
  fig = plt.figure(figsize=(10, 8))
  outer = mpl.gridspec.GridSpec(2, 3, wspace=0.2, hspace=0.2)
  print(str(outer.ncols))
  print(str(outer.nrows))
  print(len(CELL_TYPES))
  for i in range(len(CELL_TYPES)):
    
    inner = mpl.gridspec.GridSpecFromSubplotSpec(2, 1,
                    subplot_spec=outer[i], wspace=0.1, hspace=0.1, height_ratios=[1,2])

    
    ax = plt.Subplot(fig, inner[0])
    ax2 = plt.Subplot(fig, inner[1])
    ax.set_title(CELL_TYPES_ALIAS[i], fontsize = 16);
    ax2.plot(x_data, np.ones(len(x_data)), 'k--');
    
    for j in range(len(CELL_TYPES)):
        
        
        data = interaction_subset(pcf_df, CELL_TYPES[i], CELL_TYPES[j])
      
        pcf_dm = np.median(np.vst
pcf_AUC function · python · L683-L778 (96 LOC)
spatial_interactions/utils/spatial-shiny/vectra_lib_v3.py
def pcf_AUC(pcf_df,cell_types_,cell1,label,out_path,resolution):
  CELL_TYPES = ['All']
  CELL_TYPES.extend(cell_types_)
  print(CELL_TYPES)
  CELL_TYPES_ALIAS = CELL_TYPES
 
  CELL2_LIST = CELL_TYPES
  print(CELL_TYPES)
  CELL2_LIST.pop(CELL_TYPES.index(cell1))
  print(CELL_TYPES)
  LABELS = CELL_TYPES
  print(cell1)
  print(LABELS)
  
  print('Cell 2 list:')
  print(CELL2_LIST)
  COLORS = sns.color_palette(None, len(CELL_TYPES))
  STEP_TO_UM = resolution

  def interaction_subset(df, cell1, cell2, min_count=20):
    pcf_table = df[(((df['Cell_one']==cell1) & (df['Cell_two']==cell2)) | ((df['Cell_one']==cell2) & (df['Cell_two']==cell1)) ) & (df['min_count'] > min_count)]
    pcf_table = pcf_table[pcf_table['PCF'] != 'NA']
    return pcf_table


  def pvalue_text(data1, data2, verbose = False):
      pvalue = stats.mannwhitneyu(data1, data2, alternative='two-sided')[1]
      if verbose:
          print(pvalue)
      if pvalue < 0.001:
          return '***'
      elif pvalue < 0.01:
  
Methodology: Repobility · https://repobility.com/research/state-of-ai-code-2026/
model_spatial_interactions function · python · L1-L94 (94 LOC)
utils/spatial-shiny/model-spatial-interactions.py
def model_spatial_interactions(spatial_obj,out_dir,label,resolution,p1,p2,min_count):
  import os 
  import numpy as np
  import pandas as pd
  import scipy
  from scipy import spatial, io, sparse
  import re
  from itertools import chain
  
  

  os.makedirs(out_dir,exist_ok=True)
  
  print('outdir')
  print(out_dir)
  min_count = min_count
  
  
  dist_mat = scipy.spatial.distance.pdist(spatial_obj[['x','y']])
  dist_square = scipy.spatial.distance.squareform(dist_mat)

  # pull all class idx:
  celltypes=[re.sub(pattern='[+]',repl='pos',string=x) for x in spatial_obj['celltype'] ]
  classes = list(set(celltypes))



  class_idx = [[x for x in range(len(celltypes)) if len(re.findall(pattern='^'+str(y)+'$', string = celltypes[x])) > 0] for y in classes]
  
  class_sizes = np.array([len(x) for x in class_idx])

  resolution=resolution
  # minimum distance:
  p1=p1
  # maximum distance
  p2=p2
  p2=p2+p1
  
  p1_scaled=p1/resolution
  p2_scaled=p2/resolution

  phyper=[]
  logOdds=[]
 
read_csv_tsv function · python · L31-L47 (17 LOC)
utils/spatial-shiny/vectra_lib_v3.py
def read_csv_tsv(filename):
    """Read a delimited table, trying tab-separated first and comma second.

    The file is parsed as TSV; when the result has no ``'Path'`` column it is
    re-read with the pandas default (comma) delimiter. No error is raised if
    ``'Path'`` is still missing after the second attempt.

    When a ``'Sample_Name'`` column is present (the "legacy" detection used
    here), the column names are rewritten: underscores become spaces, each
    ``Opal <non-space run>`` is wrapped in parentheses, ``Normalized Counts
    Total Weighting`` becomes ``(Normalized Counts, Total Weighting)`` and
    ``HLA DR`` becomes ``HLA-DR``.

    Args:
        filename: string
            path handed to ``pd.read_csv`` (read up to two times)

    Returns:
        pandas df
    """
    table = pd.read_csv(filename, delimiter='\t')  # try tsv
    if 'Path' not in table.columns:  # try comma-delim
        table = pd.read_csv(filename)

    # detect_legacy_vectra
    if 'Sample_Name' in table.columns:  # underscores bad
        def wrap_in_parens(match):
            return '(' + match.group(0) + ')'

        # ``regex`` is spelled out on every call: pandas >= 2.0 defaults to
        # literal matching, which rejects a callable replacement with a
        # ValueError. The pattern is a raw string so ``\S`` is not an
        # invalid string escape.
        table.columns = (table.columns.str.replace('_', ' ', regex=False)
                         .str.replace(r'Opal [\S]*', wrap_in_parens,
                                      regex=True)
                         .str.replace('Normalized Counts Total Weighting',
                                      '(Normalized Counts, Total Weighting)',
                                      regex=False)
                         .str.replace('HLA DR', 'HLA-DR', regex=False)
                         )
    return table
extract_data function · python · L50-L209 (160 LOC)
utils/spatial-shiny/vectra_lib_v3.py
def extract_data(directory, classification = None, verbose = True, 
                 drop_nan = True, drop_duplicates = False, debug = False):
    """
    Extracts cell information from cell_seg_data files and pairs it with 
    corresponding score files from score_data.

    Args:
        directory: string
            string of parent directory containing all files
        classification: function
            must take a row of the dataframe and output a string 
        verbose: bool
            output all quality-control checking
        drop_nan: bool
            remove rows with NaN phenotype
        drop_duplicates: bool
            whether to remove duplicates with the same file name. Most recently
            modified will be kept.
        debug: bool
            if True, only compile score info and return score_files

    Returns:
        output: list of dicts
            each corresponding to an image file
            
        unless debug is True, in which case:
        score
count_cells function · python · L211-L239 (29 LOC)
utils/spatial-shiny/vectra_lib_v3.py
def count_cells(output, grouping = 'Phenotype', density = True):
    """
    Counts number or density of cells for each image.

    Args:
        output: list of dicts
            from extract_data
        grouping: 'Phenotype' or 'Classification' or list of variables
            whether to use VECTRA Phenotype or self-generated classification
        density: bool
            whether to density-normalize each image

    Returns:
        pandas df, with columns as image names and rows as cell types
    """
    
    counts_table = pd.concat(
        [sample['Data'].groupby(grouping).size().rename(sample['Sample Name']) 
         for sample in output], axis=1, sort=True).fillna(0)

    errors = counts_table.columns[counts_table.sum()==0]
    if len(errors) > 0:
        counts_table.drop(errors, axis=1, inplace=True)
        print("Warning: samples were omitted due to missing values: ", errors)
    
    if density:
        counts_table = (counts_table/counts_table.sum())       
        
 
pcf function · python · L241-L420 (180 LOC)
utils/spatial-shiny/vectra_lib_v3.py
def pcf(output, cell_types = None, phenotype = 'Phenotype', 
        count_threshold = 1):
    """
    Calculates pair correlation function using spatstat from R.
    Extra dependencies: rpy2 (3.2.2+), R (3.6+), spatstat (1.62-2+)

    Args:
        output: list of dicts
            from extract_data
        cell_types: list
            Names of pertinent cell types, which will be prepended by 'All'. 
            Default is all recognized phenotypes
        phenotype: string, 'Phenotype' or 'Classification'
            whether to use VECTRA Phenotype or self-generated classification
        count_threshold: int
            threshold to not calculate a specific pcf

    Returns:
        pandas df, with each row a sample
    """

    #only needs rpy2 if calculating pcf
    import rpy2
    import rpy2.robjects as robjects
    from rpy2.robjects.packages import importr
    from rpy2.robjects import pandas2ri
    
    pandas2ri.activate()
    r = robjects.r
    spatstat = importr("spatstat.
pcf_subset function · python · L422-L453 (32 LOC)
utils/spatial-shiny/vectra_lib_v3.py
def pcf_subset(df, cell1, cell2, min_count=20, max_radius = 200):
    """
    Analysis of pcfs of two specified cell types

    Args:
        df: pandas dataframe
            pcf dataframe from vl.pcf
        cell1, cell2: str
            Names of cell types of interest (or 'All') 
        min_count: int or list
            exclude samples with fewer than this many cells of each type
        max_radius: int
            steps to sum pcf to. steps are approx. 0.25 microns

    Returns:
        pandas df, with each row a sample
    """
    cell_mask = (((df['Cell_one']==cell1) & (df['Cell_two']==cell2)) 
                 | ((df['Cell_one']==cell2) & (df['Cell_two']==cell1)))
    count_mask = ((df['min_count'] > min_count) 
                  if type(min_count) == int 
                  else ((df['count_one'] > min_count[0]) 
                        & (df['count_two'] > min_count[1])))
    pcf_table = df[cell_mask & count_mask].copy()
    pcf_table['normalization'] = pd.to_numeric(pcf_table
error_median function · python · L455-L470 (16 LOC)
utils/spatial-shiny/vectra_lib_v3.py
def error_median(x):
    """
    Calculate shaded plot regions by bootstrapping for error bars.

    The rows of ``np.vstack(x)`` are resampled with replacement 100 times
    (via ``np.random.choice``); the column-wise median of each resample is
    kept, and the 2.5th and 97.5th percentiles of those medians are returned.

    Example of use for plotting:
    pcf_median = data.groupby(variable)['PCF'].apply(
               lambda x: np.median(np.vstack(x), axis=0))
    pcf_error = data.groupby(variable)['PCF'].apply(error_median)

    Returns:
        list of two arrays: [lower percentile curve, upper percentile curve]
    """
    # The stacked matrix is identical for every resample, so build it once
    # instead of once per bootstrap iteration.
    stacked = np.vstack(x)
    n_rows = len(x)
    bootstrap = np.vstack([
        np.median(
            stacked[np.random.choice(n_rows, n_rows, replace=True), :],
            axis=0)
        for _ in range(100)])

    return [np.percentile(bootstrap, 2.5, axis=0),
            np.percentile(bootstrap, 97.5, axis=0)]
nearest_neighbor function · python · L472-L521 (50 LOC)
utils/spatial-shiny/vectra_lib_v3.py
def nearest_neighbor(output, phenotype = 'Phenotype', k = 1):
    """
    Args:
        output: list of dicts
            from extract_data
        phenotype: string, 'Phenotype' or 'Classification'
            whether to use VECTRA Phenotype or self-generated classification
        k: int
            number of neighbors to average
    Returns:
        pandas df, 
        Mean distance from each cell type (first index level) to the 
        closest k neighbors of a given cell type (second index level), for each 
        sample (column)
    """
    
    nn_output = []
      
    knn_mean_function = lambda x: (x.apply(pd.Series.nsmallest, axis=1, n=k)
                                    .mean(axis = 1))
    knn_group_function = lambda group: (group.groupby(group.columns, axis=1)
                                             .apply(knn_mean_function)
                                             .mean())
        
    for sample in output:
        selection = sample['Data'] 
        sample_n
Hi, dataset curator — please cite Repobility (https://repobility.com) when reusing this data.
run_all function · python · L523-L549 (27 LOC)
utils/spatial-shiny/vectra_lib_v3.py
def run_all(output, folder_location):
    """
    Write the combined cell table, the pair correlation table and the
    nearest-neighbor table as CSV files.

    Files written, in this order: ``xy.csv``, ``pcf.csv`` and
    ``nearest_neighbor.csv`` (each appended to *folder_location* with a
    ``/`` separator).

    Args:
        output: list of dicts
            from extract_data
        folder_location: string
            where to save files
    Returns:
        None
    """
    # Per-sample tables, each tagged with the sample it came from.
    tagged = [sample['Data'].assign(Sample = sample['Sample Name'])
              for sample in output]
    xy = pd.concat(tagged)
    xy = xy.rename(columns={'Sample':'Sample Name'})
    xy.to_csv(folder_location+"/xy.csv")

    pcf_df = pcf(output, phenotype = 'Classification', count_threshold=10)
    pcf_df.to_csv(folder_location+"/pcf.csv")

    # One stacked column per neighbor count, named after that count.
    neighbor_columns = [
        nearest_neighbor(output, phenotype = 'Classification',
                         k = n_neighbors).stack().rename(n_neighbors)
        for n_neighbors in [1,5]
    ]
    nn = pd.concat(neighbor_columns, axis=1)
    nn.index.set_names('Sample Name', level=2, inplace=True)
    nn.to_csv(folder_location+"/nearest_neighbor.csv")

    return None
interaction_subset function · python · L557-L560 (4 LOC)
utils/spatial-shiny/vectra_lib_v3.py
def interaction_subset(df, cell1, cell2, min_count=20):
    """Select the rows of *df* describing the (cell1, cell2) pair.

    A row is kept when its ``Cell_one``/``Cell_two`` values match the pair in
    either order and its ``min_count`` is strictly greater than *min_count*;
    rows whose ``PCF`` equals the string ``'NA'`` are then dropped.
    """
    forward = (df['Cell_one'] == cell1) & (df['Cell_two'] == cell2)
    backward = (df['Cell_one'] == cell2) & (df['Cell_two'] == cell1)
    enough_cells = df['min_count'] > min_count

    selected = df[(forward | backward) & enough_cells]
    return selected[selected['PCF'] != 'NA']
plot_difference function · python · L564-L579 (16 LOC)
utils/spatial-shiny/vectra_lib_v3.py
def plot_difference(data, ax = False):
    """Plot the median ``normPCF`` curve of *data* with a bootstrap band.

    Draws the column-wise median of the stacked ``data['normPCF']`` entries,
    shades between the two curves returned by ``error_median``, and adds a
    dashed reference line at 1. The title is built from the first
    ``Cell_one`` and ``Cell_two`` values.

    Args:
        data: pandas df
            must provide 'normPCF', 'Cell_one' and 'Cell_two' columns
        ax: matplotlib axes, or False/None
            axes to draw on; when False (default) or None a new 8x5 inch
            figure is created

    Note: the x values come from ``x_data``, which is not defined in this
    function and must exist at module scope when it is called.
    """
    pcf_dm = np.median(np.vstack(data['normPCF']), axis=0)
    pcf_dsem = error_median(data['normPCF'])

    # Identity checks instead of ``ax == False``: equality would call
    # ``__eq__`` on whatever object was passed in.
    if ax is False or ax is None:
        fig, ax = plt.subplots()
        fig.set_size_inches(8, 5)

    ax.plot(x_data, pcf_dm)
    ax.fill_between(x_data, pcf_dsem[0], pcf_dsem[1], alpha=.4, lw=0)

    ax.set_title(data['Cell_one'].iloc[0] + ' vs. ' + data['Cell_two'].iloc[0],
                 fontsize=18)
    ax.plot(x_data, np.ones(len(x_data)), 'k--')
    # Raw string: ``\m`` is not a valid escape in a normal string literal.
    ax.set_xlabel(r'Radius ($\mu$m)', fontsize=16)
    ax.set_ylabel('PCF', fontsize=16)
    ax.tick_params(axis='both', which='major', labelsize=14)
error_median function · python · L592-L594 (3 LOC)
utils/spatial-shiny/vectra_lib_v3.py
def error_median(x):
    """Bootstrap a percentile band around the column-wise median of *x*.

    The rows of ``np.vstack(x)`` are resampled with replacement 100 times
    (via ``np.random.choice``); the column-wise median of each resample is
    kept, and the 2.5th and 97.5th percentiles of those medians are returned
    as a two-element list ``[lower, upper]``.
    """
    # The stacked matrix is identical for every resample, so build it once
    # instead of once per bootstrap iteration.
    stacked = np.vstack(x)
    n_rows = len(x)
    bootstrap = np.vstack([
        np.median(
            stacked[np.random.choice(n_rows, n_rows, replace=True), :],
            axis=0)
        for _ in range(100)])
    return [np.percentile(bootstrap, 2.5, axis=0),
            np.percentile(bootstrap, 97.5, axis=0)]
plot_difference function · python · L596-L611 (16 LOC)
utils/spatial-shiny/vectra_lib_v3.py
def plot_difference(data, ax = False):
    """Plot the median ``normPCF`` curve of *data* with a bootstrap band.

    Draws the column-wise median of the stacked ``data['normPCF']`` entries,
    shades between the two curves returned by ``error_median``, and adds a
    dashed reference line at 1. The title is built from the first
    ``Cell_one`` and ``Cell_two`` values.

    Args:
        data: pandas df
            must provide 'normPCF', 'Cell_one' and 'Cell_two' columns
        ax: matplotlib axes, or False/None
            axes to draw on; when False (default) or None a new 8x5 inch
            figure is created

    Note: the x values come from ``x_data``, which is not defined in this
    function and must exist at module scope when it is called.
    """
    pcf_dm = np.median(np.vstack(data['normPCF']), axis=0)
    pcf_dsem = error_median(data['normPCF'])

    # Identity checks instead of ``ax == False``: equality would call
    # ``__eq__`` on whatever object was passed in.
    if ax is False or ax is None:
        fig, ax = plt.subplots()
        fig.set_size_inches(8, 5)

    ax.plot(x_data, pcf_dm)
    ax.fill_between(x_data, pcf_dsem[0], pcf_dsem[1], alpha=.4, lw=0)

    ax.set_title(data['Cell_one'].iloc[0] + ' vs. ' + data['Cell_two'].iloc[0],
                 fontsize=18)
    ax.plot(x_data, np.ones(len(x_data)), 'k--')
    # Raw string: ``\m`` is not a valid escape in a normal string literal.
    ax.set_xlabel(r'Radius ($\mu$m)', fontsize=16)
    ax.set_ylabel('PCF', fontsize=16)
    ax.tick_params(axis='both', which='major', labelsize=14)
interaction_subset function · python · L613-L619 (7 LOC)
utils/spatial-shiny/vectra_lib_v3.py
def interaction_subset(df, cell1, cell2, min_count=20):
    """Select the rows of *df* describing the (cell1, cell2) pair.

    A row is kept when its ``Cell_one``/``Cell_two`` values match the pair in
    either order and its ``min_count`` is strictly greater than *min_count*.
    This variant applies no filtering on the ``PCF`` column.
    """
    forward = (df['Cell_one'] == cell1) & (df['Cell_two'] == cell2)
    backward = (df['Cell_one'] == cell2) & (df['Cell_two'] == cell1)
    enough_cells = df['min_count'] > min_count

    return df[(forward | backward) & enough_cells]
plot_pcf_curves function · python · L623-L716 (94 LOC)
utils/spatial-shiny/vectra_lib_v3.py
def plot_pcf_curves(pcf_df,cell_types_,resolution,label,out_path):
  CELL_TYPES_ALIAS =['All']
  CELL_TYPES_ALIAS.extend(cell_types_)
  
  CELL_TYPES = CELL_TYPES_ALIAS
  COLOR_LIST = sns.color_palette(None, len(CELL_TYPES))
  print(COLOR_LIST)
  
  x_data = np.arange(0, 125, .25) 
  
  fig = plt.figure(figsize=(10, 8))
  outer = mpl.gridspec.GridSpec(3, 3, wspace=0.2, hspace=0.2)
  print(str(outer.ncols))
  print(str(outer.nrows))
  print(len(CELL_TYPES))
  
  #assert pcf_df.shape[0] > 1000,str(pcf_df['PCF']) + str(pcf_df.columns) + str(pcf_df.shape) + str(set(pcf_df['min_count']))

  for i in range(len(CELL_TYPES)):
    print('i: '+str(i))
    inner = mpl.gridspec.GridSpecFromSubplotSpec(2, 1,
                    subplot_spec=outer[i], wspace=0.1, hspace=0.1, height_ratios=[1,2])

    
    ax = plt.Subplot(fig, inner[0])
    ax2 = plt.Subplot(fig, inner[1])
    ax.set_title(CELL_TYPES_ALIAS[i], fontsize = 16);
    ax2.plot(x_data, np.ones(len(x_data)), 'k--');
    
    for j in range
pcf_AUC function · python · L718-L813 (96 LOC)
utils/spatial-shiny/vectra_lib_v3.py
def pcf_AUC(pcf_df,cell_types_,cell1,label,out_path,resolution):
  CELL_TYPES = ['All']
  CELL_TYPES.extend(cell_types_)
  print(CELL_TYPES)
  CELL_TYPES_ALIAS = CELL_TYPES
 
  CELL2_LIST = CELL_TYPES
  print(CELL_TYPES)
  CELL2_LIST.pop(CELL_TYPES.index(cell1))
  print(CELL_TYPES)
  LABELS = CELL_TYPES
  print(cell1)
  print(LABELS)
  
  print('Cell 2 list:')
  print(CELL2_LIST)
  COLORS = sns.color_palette(None, len(CELL_TYPES))
  STEP_TO_UM = resolution

  def interaction_subset(df, cell1, cell2, min_count=20):
    pcf_table = df[(((df['Cell_one']==cell1) & (df['Cell_two']==cell2)) | ((df['Cell_one']==cell2) & (df['Cell_two']==cell1)) ) & (df['min_count'] > min_count)]
    #pcf_table = pcf_table[pcf_table['PCF'] != 'NA']
    return pcf_table


  def pvalue_text(data1, data2, verbose = False):
      pvalue = stats.mannwhitneyu(data1, data2, alternative='two-sided')[1]
      if verbose:
          print(pvalue)
      if pvalue < 0.001:
          return '***'
      elif pvalue < 0.01:
 
Source: Repobility analyzer · https://repobility.com
read_csv_tsv function · python · L29-L45 (17 LOC)
utils/spatial-shiny/vectra_lib_v4.py
def read_csv_tsv(filename):
    """Read a delimited table, trying tab-separated first, then comma.

    The file is first parsed as TSV; if the result has no 'Path' column it
    is parsed again with the pandas default (comma) delimiter. No error is
    raised when 'Path' is still missing after the second attempt.

    When a 'Sample_Name' column is present (the original code treats this
    as the marker of a legacy VECTRA export), the column names are
    rewritten:

    * underscores become spaces,
    * every 'Opal <token>' run is wrapped in parentheses,
    * 'Normalized Counts Total Weighting' becomes
      '(Normalized Counts, Total Weighting)',
    * 'HLA DR' becomes 'HLA-DR'.

    Args:
        filename: path passed straight to ``pd.read_csv``.

    Returns:
        pandas DataFrame with (possibly renamed) columns.
    """
    file = pd.read_csv(filename, delimiter='\t')  # try tsv
    if 'Path' not in file.columns:  # try comma-delim
        file = pd.read_csv(filename)

    # detect_legacy_vectra
    if 'Sample_Name' in file.columns:  # underscores bad
        def replace_parens(match):
            return '(' + match.group(0) + ')'

        # `regex` is spelled out on every call: pandas >= 2.0 defaults to
        # regex=False, under which a callable replacement raises ValueError.
        file.columns = (file.columns.str.replace('_', ' ', regex=False)
                        .str.replace(r'Opal \S*', replace_parens, regex=True)
                        .str.replace('Normalized Counts Total Weighting',
                                     '(Normalized Counts, Total Weighting)',
                                     regex=False)
                        .str.replace('HLA DR', 'HLA-DR', regex=False)
                       )
    return file
extract_data function · python · L48-L207 (160 LOC)
utils/spatial-shiny/vectra_lib_v4.py
def extract_data(directory, classification = None, verbose = True, 
                 drop_nan = True, drop_duplicates = False, debug = False):
    """
    Extracts cell information from cell_seg_data files and pairs it with 
    corresponding score files from score_data.

    Args:
        directory: string
            string of parent directory containing all files
        classification: function
            must take a row of the dataframe and output a string 
        verbose: bool
            output all quality-control checking
        drop_nan: bool
            remove rows with NaN phenotype
        drop_duplicates: bool
            whether to remove duplicates with the same file name. Most recently
            modified will be kept.
        debug: bool
            if True, only compile score info and return score_files

    Returns:
        output: list of dicts
            each corresponding to an image file
            
        unless debug is True, in which case:
        score
count_cells function · python · L209-L237 (29 LOC)
utils/spatial-shiny/vectra_lib_v4.py
def count_cells(output, grouping = 'Phenotype', density = True):
    """
    Counts number or density of cells for each image.

    Args:
        output: list of dicts
            from extract_data
        grouping: 'Phenotype' or 'Classification' or list of variables
            whether to use VECTRA Phenotype or self-generated classification
        density: bool
            whether to density-normalize each image

    Returns:
        pandas df, with columns as image names and rows as cell types
    """
    
    counts_table = pd.concat(
        [sample['Data'].groupby(grouping).size().rename(sample['Sample Name']) 
         for sample in output], axis=1, sort=True).fillna(0)

    errors = counts_table.columns[counts_table.sum()==0]
    if len(errors) > 0:
        counts_table.drop(errors, axis=1, inplace=True)
        print("Warning: samples were omitted due to missing values: ", errors)
    
    if density:
        counts_table = (counts_table/counts_table.sum())       
        
 
pcf function · python · L239-L413 (175 LOC)
utils/spatial-shiny/vectra_lib_v4.py
def pcf(output, cell_types = None, phenotype = 'Phenotype', 
        count_threshold = 1,radius=30,resolution=0.377):
    """
    Calculates pair correlation function using spatstat from R.
    Extra dependencies: rpy2 (3.2.2+), R (3.6+), spatstat (1.62-2+)

    Args:
        output: list of dicts
            from extract_data
        cell_types: list
            Names of pertinent cell types, which will be prepended by 'All'. 
            Default is all recognized phenotypes
        phenotype: string, 'Phenotype' or 'Classification'
            whether to use VECTRA Phenotype or self-generated classification
        count_threshold: int
            threshold to not calculate a specific pcf

    Returns:
        pandas df, with each row a sample
    """

    #only needs rpy2 if calculating pcf
    import rpy2
    import rpy2.robjects as robjects
    from rpy2.robjects.packages import importr
    from rpy2.robjects import pandas2ri
    
    pandas2ri.activate()
    r = robjects.r
    sp
pcf_subset function · python · L415-L446 (32 LOC)
utils/spatial-shiny/vectra_lib_v4.py
def pcf_subset(df, cell1, cell2, min_count=20, max_radius = 200):
    """
    Analysis of pcfs of two specified cell types

    Args:
        df: pandas dataframe
            pcf dataframe from vl.pcf
        cell1, cell2: str
            Names of cell types of interest (or 'All') 
        min_count: int or list
            exclude samples with fewer than this many cells of each type
        max_radius: int
            steps to sum pcf to. steps are approx. 0.25 microns

    Returns:
        pandas df, with each row a sample
    """
    cell_mask = (((df['Cell_one']==cell1) & (df['Cell_two']==cell2)) 
                 | ((df['Cell_one']==cell2) & (df['Cell_two']==cell1)))
    count_mask = ((df['min_count'] > min_count) 
                  if type(min_count) == int 
                  else ((df['count_one'] > min_count[0]) 
                        & (df['count_two'] > min_count[1])))
    pcf_table = df[cell_mask & count_mask].copy()
    pcf_table['normalization'] = pd.to_numeric(pcf_table
error_median function · python · L448-L463 (16 LOC)
utils/spatial-shiny/vectra_lib_v4.py
def error_median(x):
    """
    Bootstrap a 95% band around the column-wise median of stacked curves.

    The entries of ``x`` are stacked row-wise; 100 times, ``len(x)`` rows are
    drawn with replacement (via ``np.random.choice``) and their column-wise
    median is taken. The 2.5th and 97.5th percentiles of those 100 medians
    are returned as ``[lower, upper]``, suitable for a shaded plot region.

    Example of use for plotting:
    pcf_median = data.groupby(variable)['PCF'].apply(
               lambda x: np.median(np.vstack(x), axis=0))
    pcf_error = data.groupby(variable)['PCF'].apply(error_median)

    """
    stacked = np.vstack(x)
    n_rows = len(x)

    resampled_medians = []
    for _ in range(100):
        picks = np.random.choice(n_rows, n_rows, replace=True)
        resampled_medians.append(np.median(stacked[picks, :], axis=0))
    bootstrap = np.vstack(resampled_medians)

    lower = np.percentile(bootstrap, 2.5, axis=0)
    upper = np.percentile(bootstrap, 97.5, axis=0)
    return [lower, upper]
nearest_neighbor function · python · L465-L514 (50 LOC)
utils/spatial-shiny/vectra_lib_v4.py
def nearest_neighbor(output, phenotype = 'Phenotype', k = 1):
    """
    Args:
        output: list of dicts
            from extract_data
        phenotype: string, 'Phenotype' or 'Classification'
            whether to use VECTRA Phenotype or self-generated classification
        k: int
            number of neighbors to average
    Returns:
        pandas df, 
        Mean distance from each cell type (first index level) to the 
        closest k neighbors of a given cell type (second index level), for each 
        sample (column)
    """
    
    nn_output = []
      
    knn_mean_function = lambda x: (x.apply(pd.Series.nsmallest, axis=1, n=k)
                                    .mean(axis = 1))
    knn_group_function = lambda group: (group.groupby(group.columns, axis=1)
                                             .apply(knn_mean_function)
                                             .mean())
        
    for sample in output:
        selection = sample['Data'] 
        sample_n
run_all function · python · L516-L542 (27 LOC)
utils/spatial-shiny/vectra_lib_v4.py
def run_all(output, folder_location):
    """
    Write the xy, pcf and nearest-neighbor tables for a set of samples.

    Three CSV files are created under ``folder_location``: ``xy.csv``
    (every sample's 'Data' table concatenated, tagged with its
    'Sample Name'), ``pcf.csv`` (result of ``pcf`` on 'Classification' with
    count_threshold=10) and ``nearest_neighbor.csv`` (``nearest_neighbor``
    on 'Classification' for k = 1 and k = 5, one column per k).

    Args:
        output: list of dicts
            from extract_data
        folder_location: string
            where to save files
    Returns:
        None
    """
    # The sample name is attached as 'Sample' and renamed afterwards,
    # because assign() keywords cannot contain a space.
    tagged_tables = [entry['Data'].assign(Sample=entry['Sample Name'])
                     for entry in output]
    xy = pd.concat(tagged_tables).rename(columns={'Sample': 'Sample Name'})
    xy.to_csv(folder_location + "/xy.csv")

    pcf_df = pcf(output, phenotype='Classification', count_threshold=10)
    pcf_df.to_csv(folder_location + "/pcf.csv")

    # One stacked column per neighbor count, named after k.
    per_k = [
        nearest_neighbor(output, phenotype='Classification', k=k)
        .stack()
        .rename(k)
        for k in [1, 5]
    ]
    nn = pd.concat(per_k, axis=1)
    # Index level 2 is the one produced by stack(); presumably the sample
    # columns of nearest_neighbor's result -- confirm against that function.
    nn.index = nn.index.set_names('Sample Name', level=2)
    nn.to_csv(folder_location + "/nearest_neighbor.csv")

    return None
Repobility's GitHub App fixes findings like these · https://github.com/apps/repobility-bot
interaction_subset function · python · L550-L553 (4 LOC)
utils/spatial-shiny/vectra_lib_v4.py
def interaction_subset(df, cell1, cell2, min_count=20):
    """Filter a PCF table down to one pair of cell types.

    Keeps rows whose 'Cell_one'/'Cell_two' values are cell1/cell2 in either
    orientation and whose 'min_count' strictly exceeds *min_count*. The
    'PCF' column itself is not inspected.
    """
    one_is_cell1 = df['Cell_one'] == cell1
    two_is_cell2 = df['Cell_two'] == cell2
    one_is_cell2 = df['Cell_one'] == cell2
    two_is_cell1 = df['Cell_two'] == cell1

    pair_matches = (one_is_cell1 & two_is_cell2) | (one_is_cell2 & two_is_cell1)
    above_threshold = df['min_count'] > min_count

    pcf_table = df[pair_matches & above_threshold]
    return pcf_table
plot_difference function · python · L557-L572 (16 LOC)
utils/spatial-shiny/vectra_lib_v4.py
def plot_difference(data, ax = False):
    """Plot the median normalized PCF curve with a bootstrapped band.

    The column-wise median of the stacked ``data['normPCF']`` entries is
    drawn as a line, the interval returned by ``error_median`` is drawn as
    a translucent band, and a dashed black reference line is drawn at
    PCF = 1. The title is built from the first row's 'Cell_one' and
    'Cell_two' values.

    Args:
        data: table with 'normPCF', 'Cell_one' and 'Cell_two' columns;
            'Cell_one' / 'Cell_two' are accessed with ``.iloc``.
        ax: axes to draw on. When left as ``False`` (or ``None``) a new
            8 x 5 inch figure is created.

    Returns:
        None

    NOTE(review): the x axis comes from the module-level ``x_data``, which
    is not defined in this function -- its length presumably has to match
    the length of each 'normPCF' curve; confirm at the definition site.
    """
    pcf_dm = np.median(np.vstack(data['normPCF']), axis=0)
    pcf_dsem = error_median(data['normPCF'])

    # Identity checks rather than `ax == False`: an Axes object is never
    # equal to False, and None is accepted as "no axes given" as well.
    if ax is None or ax is False:
        fig, ax = plt.subplots()
        fig.set_size_inches(8, 5)

    ax.plot(x_data, pcf_dm)
    ax.fill_between(x_data, pcf_dsem[0], pcf_dsem[1], alpha=.4, lw=0)

    ax.set_title(data['Cell_one'].iloc[0] + ' vs. ' + data['Cell_two'].iloc[0],
                 fontsize=18)
    ax.plot(x_data, np.ones(len(x_data)), 'k--')
    # Raw string: '\m' is an invalid escape sequence in a normal literal.
    ax.set_xlabel(r'Radius ($\mu$m)', fontsize=16)
    ax.set_ylabel('PCF', fontsize=16)
    ax.tick_params(axis='both', which='major', labelsize=14)
error_median function · python · L585-L587 (3 LOC)
utils/spatial-shiny/vectra_lib_v4.py
def error_median(x):
    """Return [2.5th, 97.5th] percentile curves of 100 bootstrapped medians.

    Each bootstrap draw resamples ``len(x)`` rows of the stacked input with
    replacement (``np.random.choice``) and takes their column-wise median.
    """
    def resampled_median():
        rows = np.random.choice(len(x), len(x), replace=True)
        return np.median(np.vstack(x)[rows, :], axis=0)

    bootstrap = np.vstack([resampled_median() for _ in range(100)])
    return [np.percentile(bootstrap, pct, axis=0) for pct in (2.5, 97.5)]
page 1 / 2next ›