SnHiCPipeline


class chr3d.SnHiCPipeline(
    genome_index: str,
    chrom_sizes: str,
    threads: int = 1,
    assembly: str = 'hg38',
    min_mapq: int = 30,
    min_distance: int = 1000,
    resolutions: Optional[List[int]] = None,
    min_contacts_per_cell: int = 1000,
    min_complexity: float = 0.3,
    max_dup_rate: float = 0.8,
)

Complete single-nucleus Hi-C pipeline.

Parameters

Parameter	Type	Description
genome_index	`str`	Path to BWA-indexed genome FASTA
chrom_sizes	`str`	Path to chromosome sizes file
threads	`int`	Number of threads for each step (default: 1)
assembly	`str`	Genome assembly name (default: `'hg38'`)
min_mapq	`int`	Minimum mapping quality (default: 30)
min_distance	`int`	Minimum pair distance in bp (default: 1000)
resolutions	`Optional[List[int]]`	Matrix resolutions for pseudobulk `.mcool` (default: `None`, which uses `[1000, 5000, 10000, 25000, 50000, 100000]`)
min_contacts_per_cell	`int`	Cell QC: min valid contacts (default: 1000)
min_complexity	`float`	Cell QC: min complexity ratio (default: 0.3)
max_dup_rate	`float`	Cell QC: max duplicate rate (default: 0.8)

Methods

run


def run(
    self,
    cells: List[Tuple[str, str, str]],
    output_dir: str,
    run_clustering: bool = False,
    cleanup: bool = False,
    start_from: int = 1,
) -> Dict[str, Any]

Run the complete sn-Hi-C pipeline on multiple cells, or resume a previous run from a later step via `start_from`.

Parameters:

Parameter	Type	Description
cells	`List[Tuple[str, str, str]]`	List of `(cell_id, fastq_r1, fastq_r2)` tuples
output_dir	`str`	Root output directory
run_clustering	`bool`	Run GNN clustering after QC (default: `False`)
cleanup	`bool`	Remove per-cell intermediates after processing (default: `False`)
start_from	`int`	Step to resume from: 1=per-cell processing, 5=cell QC, 6=pseudobulk, 7=clustering (default: 1)

Returns:

Dict[str, Any] containing:

'per_cell_stats': Dict of per-cell processing stats
'passing_cells': List of QC-passing cell IDs
'failing_cells': List of QC-failing cell IDs
'pseudobulk': Pseudobulk output paths
'clustering': Clustering results (if run_clustering=True)
'timing': Step-by-step timing breakdown

Example:


from chr3d.hic import SnHiCPipeline
 
pipeline = SnHiCPipeline(
    genome_index="/data/genomes/hg38.fa",
    chrom_sizes="/data/genomes/hg38.chrom.sizes",
    threads=24,
)
 
# Sample manifest: list of (cell_id, fastq_r1, fastq_r2)
cells = [
    ("cell_001", "cell001_R1.fastq.gz", "cell001_R2.fastq.gz"),
    ("cell_002", "cell002_R1.fastq.gz", "cell002_R2.fastq.gz"),
]
 
stats = pipeline.run(
    cells=cells,
    output_dir="sn_hic_results/",
    run_clustering=True,
)
 
print(f"Passing cells: {len(stats['passing_cells'])}")
print(f"Pseudobulk: {stats['pseudobulk']['mcool']}")

process_cell


def process_cell(
    self,
    cell_id: str,
    fastq_r1: str,
    fastq_r2: str,
    output_dir: str,
    cleanup: bool = False,
) -> Dict[str, Any]

Run the full Hi-C processing pipeline for a single cell.

Reuses bulk Hi-C components (HiCAligner, HiCSamProcessor, HiCPairsProcessor, HiCMatrixGenerator).

Parameters:

Parameter	Type	Description
cell_id	`str`	Unique identifier for this cell
fastq_r1	`str`	Path to R1 FASTQ file
fastq_r2	`str`	Path to R2 FASTQ file
output_dir	`str`	Cell-specific output directory
cleanup	`bool`	Remove intermediate files after processing (default: `False`)

Returns:

Dict[str, Any] with keys:

'cell_id': Cell identifier
'status': Processing status ('success' or 'failed')
'duration_seconds': Processing time
'cool_file': Path to output .cool file
'num_contacts': Number of valid contacts
'align_stats', 'sam_stats', 'pairs_stats', 'matrix_stats': Detailed statistics

run_cell_qc


def run_cell_qc(
    self,
    cell_stats: Dict[str, Dict],
    output_dir: str,
) -> Tuple[List[str], List[str]]

Apply QC filters to all processed cells.

Parameters:

Parameter	Type	Description
cell_stats	`Dict[str, Dict]`	Dict mapping `cell_id` → per-cell stats from `process_cell()`
output_dir	`str`	Directory to write QC reports

Returns:

Tuple[List[str], List[str]] — (passing_cell_ids, failing_cell_ids)

run_pseudobulk


def run_pseudobulk(
    self,
    passing_cells: List[str],
    cell_cool_files: Dict[str, str],
    output_dir: str,
) -> Dict[str, Any]

Aggregate passing cells into a pseudobulk contact matrix.

Parameters:

Parameter	Type	Description
passing_cells	`List[str]`	List of passing cell IDs
cell_cool_files	`Dict[str, str]`	Dict mapping `cell_id` → path to `.cool` file
output_dir	`str`	Output directory for pseudobulk matrix

Returns:

Dict[str, Any] with keys:

'status': Aggregation status
'num_cells': Number of cells aggregated
'merged_cool': Path to merged .cool file
'mcool': Path to multi-resolution .mcool file

run_clustering


def run_clustering(
    self,
    passing_cells: List[str],
    cell_cool_files: Dict[str, str],
    output_dir: str,
    resolution: int = 1_000_000,
) -> Dict[str, Any]

Run GNN-based cell type clustering on passing cells.

Builds a cell×bin matrix from per-cell .cool files, then runs GraphSAGE + Leiden clustering.

Parameters:

Parameter	Type	Description
passing_cells	`List[str]`	List of passing cell IDs
cell_cool_files	`Dict[str, str]`	Dict mapping `cell_id` → path to `.cool` file
output_dir	`str`	Output directory for clustering results
resolution	`int`	Bin resolution in bp for cell×bin matrix (default: 1,000,000)

Note: `run_clustering()` is not yet fully implemented and currently raises `NotImplementedError`.