Full Reference Configuration

Warning

Copying and pasting this configuration as is is not recommended. It is meant to be a reference and should be modified to fit your specific use case.

This is a full reference configuration example file for the Depictio CLI. It includes all available options and their descriptions. You can use this as a template to create your own configuration file.

# Required: The name of the project
name: "Strand-Seq data analysis"

# Optional: The URL of the project in an external project management system
data_management_platform_project_url: "https://labid.embl.org/core/projects/default/5baa8f07-bd00-46e7-b3cb-ec79d01f6f3c"

# Required: Workflows that are available in the system (list - one entry per workflow)
workflows:
  # Required: The name of the workflow
  - name: "mosaicatcher-pipeline"

    # Required: Engine that is used to run the workflow
    engine:
      # Required: The name of the engine
      name: "snakemake"

      # Optional: The version of the engine (kept as a quoted string)
      version: "7.32.0"

    # Optional: Reference to the workflow's entry in an external workflow catalog
    catalog:
      # Required: The name of the catalog (valid values: "smk-wf-catalog", "nf-core", "workflowhub")
      name: "smk-wf-catalog"

      # Required: The URL of the workflow's entry in the catalog
      url: "https://snakemake.github.io/snakemake-workflow-catalog/?repo=friendsofstrandseq/mosaicatcher-pipeline"

    # Optional: A short description of the workflow
    description: "Strand-Seq SV calling framework"

    # Optional: The URL of the source code repository that contains the workflow
    repository_url: "https://github.com/friendsofstrandseq/mosaicatcher-pipeline"

    # Optional: The version of the workflow itself (the engine version is set under engine.version)
    version: "2.3.5"

    # Required: The configuration to retrieve workflow data
    config:
      # Required: List of locations containing the workflow runs
      # - multiple locations can be specified
      # - paths can be absolute or relative to the configuration file
      # - paths can contain environment variables which are resolved at runtime
      #   (as in the {DATA_LOCATION} placeholder below)
      parent_runs_location:
        - "{DATA_LOCATION}/mosaicatcher-pipeline-2024"

      # Required: Regular expression used to search for the runs (".*" matches any name)
      runs_regex: ".*"

    # Required: Data collections that are associated with the workflow (list - one entry per data collection)
    data_collections:
      # Required: The tag that is used to identify the data collection
      - data_collection_tag: "mosaicatcher_stats"

        # Optional: A description of the data collection
        description: "Statistics file generated by MosaiCatcher"

        # Required: The configuration of the data collection
        config:
          # Required: The type of the data collection (options: Table, JBrowse2)
          type: "Table"

          # Required: The metatype of the data collection (options: Metadata, Aggregate)
          # - Metadata: a single annotation file
          # - Aggregate: the aggregation of multiple files into a single dataframe
          metatype: "Aggregate"

          # Required: The configuration to scan the data collection files
          scan:
            # Required: The mode of the scan (options: single, recursive)
            # - single: scan a single file
            # - recursive: scan all files in a directory and its subdirectories
            mode: "recursive"

            # Required: The parameters of the scan - depends on the mode
            # Set only the parameter that matches the mode chosen above.
            scan_parameters:
              # If mode is recursive, use regex_config
              regex_config:
                # Required: The pattern (regular expression) to search for files
                # The dot before "info_raw" is escaped so that it only matches a literal ".";
                # single quotes keep the backslash literal in YAML.
                pattern: '.*\.info_raw'

              # If mode is single, use filename (the file to scan) instead of regex_config
              # filename: "file_path.txt"

          # Required: The specific properties of the data collection - these depend on the type of the data collection (Table here)
          dc_specific_properties:
            # Required: The file format of the data collection (options: CSV, TSV, XLSX, HDF5, Parquet)
            format: "TSV"

            # Required: The Polars options for the data collection
            # NOTE(review): the key name suggests keyword arguments forwarded to the Polars reader - confirm
            polars_kwargs:
              skip_rows: 13
              # Tab character - double quotes are needed for the "\t" escape to be interpreted
              separator: "\t"
              # Other options for polars_kwargs:
              # - column_types
              # - column_names
              # ...

            # Optional: The columns that are kept in the table - all other columns are dropped
            keep_columns:
              - "sample"
              - "cell"
              - "mapped"
              - "dupl"
              - "pass1"
              - "good"

            # Optional: The description of the columns (column name: description)
            columns_description:
              sample: "Sample ID"
              cell: "Cell ID"
              mapped: "Total number of reads seen"
              dupl: "Reads filtered out as PCR duplicates"
              pass1: "Coverage compliant cells (binary)"
              good: "Reads used for counting"

          # Optional: Allows joining this data collection with other data collections over specific columns or conditions
          join:
            # Required: The columns that are used to join the data collections
            on_columns:
              - "sample"
              - "cell"

            # Required: The type of join that is used to join the data collections
            how: "inner"

            # Required: The list of data collections that this data collection is joined with
            # NOTE(review): entries are presumably data_collection_tag values - confirm
            with_dc:
              - "ashleys_labels"