# Please check the parameters, and adjust them according to your needs # Project name PROJECT: Test1 # ================== Control of the workflow ================== ## Do you want to download FASTQ files from the public Sequence Read Archive (SRA)? SRA: no # "yes" or "no". # If set to "yes", the workflow will stop after the QC to let you decide whether you want to trim your raw data or not. # In order to run the rest of the workflow, you have to set it to "no". ## Do you need to do quality control? QC: no # "yes" or "no". # If set to "yes", the workflow will stop after the QC to let you decide whether you want to trim your raw data or not. # In order to run the rest of the workflow, you have to set it to "no". ## Do you need to do trimming? TRIMMED: yes # "yes" or "no"? ## Do you need to do mapping and feature counting? MAPPING: yes # "yes" or "no" ## Which mapping reference do you want to use? Genome or transcriptome? REFERENCE: genome # "genome" or "transcriptome" ## Do you want to study the repeats? REPEATS: yes # "yes" or "no" ## Do you want to do Differential Expression Analysis (DEA)? DEA: yes # "yes" or "no" # ================== Shared parameters for some or all of the sub-workflows ================== ## key file if the data is stored remotely, otherwise leave it empty KEY: ## place to save singularity/apptainer image (1.5 GB) SINGULARITY: /shared/projects/bi4edc ## the path to fastq files READSPATH: TestDataset/Raw_fastq # for testing purposes we use relative paths here, but we recommend always using full paths such as /shared/projects/YourProjectName/Raw_Fastq ## the meta file describing the experiment settings METAFILE: TestDataset/configs/metadata.tsv ## paths for intermediate and final results BIGDATAPATH: TestDataset/data # for big files RESULTPATH: TestDataset/results ## is the sequencing paired-end or single-end?
END: pair # "pair" or "single" ## maximum number of cores you want to allocate to one job of the workflow (mapping and feature counting) NCORE: 32 ## maximum number of jobs running in parallel NJOBS: 100 # ================== Configuration for Quality Control ================== ## All required params have already been defined in the shared parameters # ================== Configuration for trimming ================== ## Number of trimmed bases ## put "no" for TRIM3 and TRIM5 if you don't want to trim a fixed number of bases. TRIM5: no # integer or "no", remove N bp from the 5' end of reads. This may be useful if the qualities were very poor, or if there is some sort of unwanted bias at the 5' end. TRIM3: no # integer or "no", remove N bp from the 3' end of reads AFTER adapter/quality trimming has been performed. # ================== Configuration for quantification using transcriptome ================== ## transcriptome file TRANS: /shared/banks/homo_sapiens/hg38/transcriptome/gencode.v41.transcripts.fa # tests ongoing ## Do you need to do gene-level differential expression analysis? GENE_LEVEL: TRUE # TRUE or FALSE. If FALSE, ignore the following 3 parameters. ## If TRUE, specify the corresponding dataset in ENSEMBL for your organism of interest or provide your own tx2gene file ENSEMBL: TRUE # TRUE or FALSE. Specify whether you're using a transcriptome from Ensembl EnsemblDataSet: hsapiens_gene_ensembl # only if ENSEMBL was set to TRUE.
Search for your dataset in the file EnsemblDataSet_look_up_table.csv ## If you're not using ENSEMBL, you have to provide your own tx2gene file (two columns, 1st col: transcript ID; 2nd col: gene ID) TX2GENE: tx2gene_custom.tsv # only if ENSEMBL was set to FALSE # ================== Configuration for alignment to genome and feature count ================== ## aligner ALIGNER: HISAT2 # "STAR" or "HISAT2" ## genome and annotation files INDEXPATH: TestDataset/hisat2_index # /shared/bank/homo_sapiens/hg38/hisat2 # folder containing index files INDEXBASE: hisat2_hg38_chr22 # for hisat2, base of the name of the index files (e.g. "genome" if the index files are named genome.1.ht2, genome.2.ht2, ...) ANNOTATION: TestDataset/gtf/gencode.v34.annotation_chr22.gtf # GTF file ## bigwig option BWSTRANDED: yes # "no": one bw file merging forward and reverse reads, "yes": get 2 bw files, one forward and one reverse; "both": get the two bw per strand as well as the merged one. ## tool for feature count COUNTER: featureCounts # "featureCounts" or "htseq-count" or "STARcount" (only with STAR aligner, --quantMode GeneCounts option) or "TEcount" (if REPEATS: yes) ## counting options COUNTOPTIONS: "-M --fraction" # add extra options for the counter (for featureCounts or htseq-count only). # featureCounts: '-O' (set allowMultiOverlap to TRUE), '-M' (set countMultiMappingReads to TRUE), '--fraction'. # htseq-count: -m MODE; --nonunique=MODE; for instance "-m intersection-nonempty --nonunique=all" ... see https://htseq.readthedocs.io ATTRIBUTE: gene_id # the attribute used in the annotation file. It's usually "gene_id", but double-check, since it may also be "gene", "ID"... STRAND: reverse # "no", "yes", "reverse". For stranded=no, a read is considered overlapping with a feature regardless of whether it is mapped to the same or the opposite strand as the feature. For stranded=yes and single-end reads, the read has to be mapped to the same strand as the feature.
For paired-end reads, the first read has to be on the same strand and the second read on the opposite strand. For stranded=reverse, these rules are reversed. FEATURE: transcript # "exon", "gene", "transcript", ... depending on your GTF file and on the feature you're interested in. SPLITBY: 50000000 # To get efficient counting, the big bam files have to be split into smaller files. Put here the number of reads per sub-file, e.g. 50000000 for featureCounts, 20000000 for htseq-count. # ================== Configuration for repeat analysis ================== GTFTE: TestDataset/gtf/hg38_TE_rmsk_chr22.gtf # GTF ANNOTATION file for repeats, must be adapted to have the FEATURE you chose ("exon", "gene", "transcript") as 3rd column. # ================== Configuration for DEA ================== ## Do you want to start the workflow directly from the count tables obtained independently? If yes, give the path of your count tables. FROMCOUNTS: no # put 'no' when using the whole workflow, or the path of your count tables. Format: one file per sample, named {sample}_countGenes.tsv, with 2 tab-separated columns (gene, count). ## Do you want to use edgeR or DESeq2 to do DEA? DEATOOL: DESeq2 # "edgeR" or "DESeq2"? DESeq2 is recommended for transcriptome-based DEA ## Is your experiment designed in a pair-wise way? PAIR: no # Is this a paired test or not? (yes or no). For instance 2 samples from the same patient taken at different times. ## the comparison(s) you want to do. If multiple comparisons, list each pair in the same order in CONTROL and TREAT CONTROL: ["J0_WT"] TREAT: ["J10_WT"] ## length of 'CONTROL' should agree with that of 'TREAT' ## what you fill in here must match the "group" column in metadata.tsv FILTER: yes # Filter out lowly expressed transcripts/genes or not? (yes or no) Setting it to "yes" is recommended.
# ================== Configuration for visualization ================== ## All required params have already been defined in the public params ========================================== SAMPLE PLAN sample group subject D197-D192T27r J0_WT 1 D197-D192T28r J0_WT 2 D197-D192T29r J0_WT 3 D197-D192T33r J10_WT 1 D197-D192T34r J10_WT 2 D197-D192T35r J10_WT 3 ========================================== CONDA ENV name: rasflow_EDC channels: - conda-forge - bioconda - r - defaults dependencies: # conda-forge channel installs - R=4.1 - python=3.9.13 - graphviz=3.0.0 - r-yaml=2.3.5 - r-statmod=1.4.36 - r-gplots=3.1.3 - r-magick=2.7.3 - r-dt=0.23 - r-sessioninfo=1.2.2 - r-knitr=1.39 - r-heatmap.plus=1.3 - r-readr=2.1.2 - r-hash - r-pheatmap=1.0.12 - r-rcolorbrewer=1.1_3 - imagemagick=7.1.0 # bioconda channel installs - snakemake=7.8.3 - fastqc=0.11.9 - trim-galore=0.6.7 - multiqc=1.12 - salmon=1.9.0 - hisat2=2.2.1 - star=2.7.10a - samtools=1.15.1 - subread=2.0.1 # featureCounts included - htseq=2.0.1 # htseq-count included - bioconductor-edger=3.36.0 - bioconductor-deseq2=1.34.0 - qualimap=2.2.2d - bioconductor-mygene=1.30.0 - bioconductor-tximport=1.22.0 - bioconductor-enhancedvolcano=1.12.0 - bioconductor-biomart=2.50.0 - deeptools=3.5.1 - bioconductor-regionreport=1.27.1 - bioconductor-glimma=2.4.0 - pysam=0.19.1 - picard=2.27.3 - parallel-fastq-dump=0.6.7 - salmon=1.9.0 ========================================== CLUSTER __default__: mem: 500 name: snakejob cpus: 1 download_singularity_img: name: singImage qualityControl: mem: 6000 name: QC cpus: 2 trim: mem: 6000 name: trimming cpus: 8 trimS: mem: 6000 name: trimming cpus: 4 hisat2: mem: 7000 name: hisat2 cpus: 32 star: mem: 40000 name: star cpus: 4 alignmentQC: mem: 10000 name: aligQC cpus: 8 BigWig: mem: 5000 name: BigWig cpus: 8 BigWigR: mem: 5000 name: BigWigR cpus: 8 BigWigF: mem: 5000 name: BigWigF cpus: 8 featureCount: mem: 10000 name: featureCount cpus: 4 htseqCount: mem: 100000 name: htseqCount plot: name: 
visualization BamIndex: name: BamIndex PCA: name: PCA mem: 2000 TEPCA: name: TEPCA mem: 2000 summaryReport: name: multiQC mem: 1000 MappingReport: name: multiQC DEA: name: DEA mem: 2000 TE_DEA: name: TE_DEA mem: 2000 quantify: name: quantTrans getReads: name: getReads starCount: name: starCount TEcount: mem: 30000 name : TEcount cpus: 24 Samtools: mem: 30000 name : Samtools cpus: 8 splitBam : mem: 1000 name : Picards sortBam: name: sortBam cpus: 8 mem: 10000 spliceSites: name: spliceSites mem: 1000 splitCountTables: name: splitCountTables mergeCountTables: name: mergeCountTables mergeSummaries: name: mergeSummaries TE_index: name: TE_index mem: 10000 gene_index: name: gene_index mem: 1000 mergeGTF: name: mergeGTF mem: 10000 fastqDump: mem: 10000 name: parallel-fastq-dump cpus: 4 indexTrans: mem: 10000 name: salmon_index cpus: 8 quantify: mem: 10000 name: salmon_quant cpu: 8 combineSamples: mem: 5000 name: combine_trans ========================================== VERSION commit b83b90ffcf7aad2d0448ccc8697f8697da94b914 Author: hennion Date: Mon Oct 31 15:44:04 2022 +0100