Full Reference Configuration
Warning
It is not recommended to copy and paste this configuration as is. It is meant to be a reference and should be adapted to fit your specific use case.
This is a full reference configuration example file for the Depictio CLI. It includes all available options and their descriptions. You can use this as a template to create your own configuration file.
# Required: Name of the project
name: "Strand-Seq data analysis"
# Optional: The URL of an external project management system reference
data_management_platform_project_url: "https://labid.embl.org/core/projects/default/5baa8f07-bd00-46e7-b3cb-ec79d01f6f3c"

# Required: Workflows that are available in the system
workflows:
  # Required: The name of the workflow
  - name: "mosaicatcher-pipeline"
    # Required: Engine that is used to run the workflow
    engine:
      # Required: The name of the engine
      name: "snakemake"
      # Optional: The version of the engine
      version: "7.32.0"
    # Optional: The URL of an external workflow catalog reference
    catalog:
      # Required: The name of the catalog
      # (valid values: "smk-wf-catalog", "nf-core", "workflowhub")
      name: "smk-wf-catalog"
      # Required: The URL of the catalog
      url: "https://snakemake.github.io/snakemake-workflow-catalog/?repo=friendsofstrandseq/mosaicatcher-pipeline"
    # Optional: The description of the workflow
    description: "Strand-Seq SV calling framework"
    # Optional: The URL of the repository that contains the workflow
    repository_url: "https://github.com/friendsofstrandseq/mosaicatcher-pipeline"
    # Optional: Version of the workflow
    version: "2.3.5"
    # Required: The configuration to retrieve workflow data
    config:
      # Required: List of locations containing the workflow runs -
      # multiple locations can be specified
      parent_runs_location:
        # Paths can be absolute or relative to the configuration file
        # Paths can contain environment variables which are resolved at runtime
        - "{DATA_LOCATION}/mosaicatcher-pipeline-2024"
      # Required: Regular expression to search the runs
      runs_regex: ".*"

    # Required: Data collections that are associated with the workflow
    data_collections:
      # Required: The tag that is used to identify the data collection
      - data_collection_tag: "mosaicatcher_stats"
        # Optional: A description of the data collection
        description: "Statistics file generated by MosaiCatcher"
        # Required: The configuration of the data collection
        config:
          # Required: The type of the data collection (options: Table, JBrowse2)
          type: "Table"
          # Required: The metatype of the data collection
          # (options: Metadata, Aggregate)
          #   Metadata: a single annotation file
          #   Aggregate: the aggregation of multiple files into a single dataframe
          metatype: "Aggregate"
          # Required: The configuration to scan the data collection files
          scan:
            # Required: The mode of the scan (options: single, recursive)
            #   single: scan a single file
            #   recursive: scan all files in a directory and its subdirectories
            mode: "recursive"
            # Required: The parameters of the scan - depends on the mode
            scan_parameters:
              # Required: The filename or directory to scan
              # If mode is recursive, use regex_config
              regex_config:
                # Required: The pattern to search for files
                # NOTE(review): the second "." is unescaped and therefore
                # matches any character; use '.*\.info_raw' (single-quoted)
                # if a literal dot is intended - confirm before changing.
                pattern: ".*.info_raw"
              # If mode is single, use filename
              filename: "file_path.txt"
          # Required: The specific properties of the data collection -
          # related to the type of the data collection (Table here)
          dc_specific_properties:
            # Required: The file format of the data collection
            # (options: CSV, TSV, XLSX, HDF5, Parquet)
            format: "TSV"
            # Required: The configuration of the Polars DataFrame for the
            # data collection
            polars_kwargs:
              skip_rows: 13
              # Double quotes are required so that \t is read as a tab
              separator: "\t"
              # Other options for polars_kwargs:
              # - column_types
              # - column_names
              # ...
            # Optional: The columns that are kept in the table -
            # all other columns are dropped
            keep_columns:
              - "sample"
              - "cell"
              - "mapped"
              - "dupl"
              - "pass1"
              - "good"
            # Optional: The description of the columns
            columns_description:
              sample: "Sample ID"
              cell: "Cell ID"
              mapped: "Total number of reads seen"
              dupl: "Reads filtered out as PCR duplicates"
              pass1: "Coverage compliant cells (binary)"
              good: "Reads used for counting"
          # Optional: Allow to join the data collections over specific
          # columns or conditions
          # NOTE(review): nesting level of `join` reconstructed (placed under
          # the data collection `config`) - confirm against the CLI schema.
          join:
            # Required: The columns that are used to join the data collections
            on_columns:
              - "sample"
              - "cell"
            # Required: The type of join that is used to join the data
            # collections
            how: "inner"
            # Required: The list of data collections that are joined with
            # this data collection
            with_dc:
              - "ashleys_labels"