#!/usr/bin/bash


# CONTENTS
# ========
#
# This file shows, step-by-step, an example down-stream analysis of alternative splicing with Matt.
# These down-stream analyses start after inclusion levels of alternative splicing events have been
# estimated with dedicated software.
#
# Section 1. : INFORMATION ON USED RNA-SEQ DATA SETS
# Section 2. : INSTALLATION OF MATT
# Section 3. : QUICK START: ONLY THE ESSENTIAL ANALYSES STEPS FROM SECTION 6 (no vast-tools for estimating inclusion levels needed)
# Section 4. : DOWNLOAD RNA-SEQ DATA FROM GEO WITH HELP OF MATT
# Section 5. : ESTIMATE INCLUSION LEVELS OF AS EVENTS OF ANY TYPE GENOME-WIDE WITH PROGRAM VAST-TOOLS (vast-tools must be installed)
# Section 6. : DOWN-STREAM ANALYSIS WITH MATT
#
# Note: Lines from this script are meant to be copy'n'pasted into the terminal where Matt needs to be available.
#
#
# 1. INFORMATION ON USED RNA-SEQ DATA SETS
# ========================================
#
# A) Paper
# NOVA2-mediated RNA regulation is required for axonal pathfinding during development, 2016, eLife 2016;5:e14371 doi: 10.7554/eLife.14371
#
# B) RNA-seq data sets, brain samples from mouse wild-type and Nova2 KD
# GEO accession number: GSE69711
# http://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE69711
#
#
# 2. INSTALLATION OF MATT
# =======================
#
# 2.1. create folder for installation
mkdir matt
cd matt
#
# 2.2. download archive
wget matt.crg.eu/matt.tar.gz
# or on Mac
curl -OL matt.crg.eu/matt.tar.gz
#
# 2.3. decompress
tar -xvzf matt.tar.gz
#
# 2.4. run INSTALL script
mkdir 700 ./INSTALL
./INSTALL
# or
source INSTALL
#
# 2.5. add link to matt to any folder which is in your PATH variable
cd /path/to/PATH/folder
ln -s /path/to/folder/where/matt/was/installed/matt
#
# 2.6. call Matt to see commands
matt
#
#
# 3. QUICK START: ONLY THE ESSENTIAL ANALYSES STEPS
# =================================================
#
# You need to have installed Matt and available Perl, R, pdflatex (latex)
#
# 3.1 Retrieve table with inclusion levels of splicing events (SE, IR, ALT3, ALT5), which was computed with vast-tools
wget http://matt.crg.eu/big_data/INCLUSION_LEVELS_FULL-Mmu8-mm9.tab.gz
# or on Mac
# curl -OL http://matt.crg.eu/big_data/INCLUSION_LEVELS_FULL-Mmu8-mm9.tab.gz
gunzip INCLUSION_LEVELS_FULL-Mmu8-mm9.tab.gz
#
# 3.2 Extract from table with events genome-wide skipped exons
# get_vast is a utility command for extracting splicing events from vast-tools output tables
# - complex : restricts retrieved events; vast-tools encodes skipped-exon events with IDs S, C1, C2, C3, MIC
# -a nova2kd : is the merged Nova2 KD sample
# -b nova2wt : is the merged Nova2 WT sample
# -minqab VLOW : vast-tools reports for each estimated PSI value a quality (N,VLOW,LOW,OK,SOK) and we restrict here the output to events to those for which the nova2kd and the nova2wt PSI could be estimated with at least VLOW quality.
# -minqglob N : similar to -minqab but applies to all other samples in the table beside the samples defined with arguments -a and -b.
# Setting it to N prevents the exclusion of events, which have VLOW in -a and -b samples, but worse quality in the other samples.
# -gtf and -f : gene IDs will be extracted automatically
gtf=/path/to/...../Mmu09.gtf
matt get_vast INCLUSION_LEVELS_FULL-Mmu8-mm9.tab -complex S,C1,C2,C3,MIC -a nova2kd -b nova2wt -minqab VLOW -minqglob N -gtf $gtf -f gene_id > exons.tab
#
# 3.3 Define groups of exons to be compared later with various analyses
# Three groups get defined: silenced exons have a dPSI in [25,100], enhanced in [-100,-25], unregulated in [-1,-1] and a PSI in wild type in [10,90] (the latter constraint excluded constitutive exons from the group of unregulated exons.
# Conditions for groups are defined with: silenced=DPSI_GRPA_MINUS_GRPB[25,100] meaning group silenced are all events (rows) where the value in column DPSI_GRPA_MINUS_GRPB is in [25,100].
# Several conditions for one group will be connected by the logic operator AND, e.g., as in case of group unregulated.
matt def_cats exons.tab GROUP 'silenced=DPSI_GRPA_MINUS_GRPB[25,100]' 'enhanced=DPSI_GRPA_MINUS_GRPB[-100,-25]' 'unregulated=DPSI_GRPA_MINUS_GRPB[-1,1] PSI_nova2wt[10,90]' | matt add_cols exons.tab -
#
# 3.4 Feature extraction of exons and discriminative feature detection
# ... which includes automatic retrieval of genomic sequences and detection of up/down-stream introns and exons.
# Therefore, the user needs to specify a FASTA file with chromosome sequences of mouse (Mmu09) and a genome annotation from Ensembl.
# Sequence IDs therein must match to sequence IDs in exons.tab, which are Ensembl gene IDs as this table was generated by vast-tools.
# For ease of this example, users can download the Mmu09 chromosomes and gene annotation here:
# Linux: wget http://matt.crg.eu/big_data/Mmu09_gDNA.fasta.gz (700 MB)
# wget http://matt.crg.eu/big_data/Mmu09.gtf.gz (15 MB)
# Mac : curl -OL http://matt.crg.eu/big_data/Mmu09_gDNA.fasta.gz
# curl -OL http://matt.crg.eu/big_data/Mmu09.gtf.gz
# gunzip Mmu09_gDNA.fasta.gz
# gunzip Mmu09.gtf.gz
fasta=/path/to/...../Mmu09_gDNA.fasta
gtf=/path/to/...../Mmu09.gtf
output_dir=cmpr_exons
matt cmpr_exons exons.tab START END SCAFFOLD STRAND GENEID $gtf $fasta Mmus 150 GROUP[silenced,enhanced,unregulated] $output_dir -colors:deepskyblue,firebrick4,lightgreen
#
# Infos: the output folder contains a
# 1.) summary PDF report with box plots (summary.pdf)
# 2.) a table with all events and extracted features (exons_with_efeatures.tab)
# 3.) a table with all p values and information on performed statistical tests (overview_of_features_and_comparisons.tab)
# 4.) all box plots as PDFs in directory summary_graphics
# More infos: This analysis can be done with all input tables describing exons, wherever they come from, if they contain the necessary pieces of information:
# 1.) exon start coordinate
# 2.) exon end
# 3.) scaffold/chromosome of exon
# 4.) strand
# 5.) any type of gene ID which matches the gene IDs in the given GTF file with gene annotation
# 6.) group IDs
#
# 3.5 Generate motif RNA-maps for all CISBP-RNA consensus motifs comparing the three exon groups
# A FASTA file with chromosome sequences of mouse (Mmu09) must be given as sequences of the exons and flanking introns
# need to be retrieved for generating a motif RNA-map. Sequence IDs therein must match sequence IDs in exons.tab.
# For ease if this example, users can download chromosome sequences for Mmu09 here:
# Linux: wget http://matt.crg.eu/big_data/Mmu09_gDNA.fasta.gz (700 MB)
# Mac : curl -OL http://matt.crg.eu/big_data/Mmu09_gDNA.fasta.gz
fasta=/path/to/...../Mmu09_gDNA.fasta
output_dir=cisbprna_maps
matt rna_maps_cisbp exons.tab UPSTRM_EX_BORDER START END DOSTRM_EX_BORDER SCAFFOLD STRAND GROUP[silenced,enhanced,unregulated] 31 35 135 $fasta cisbprna_regexps -d $output_dir
#
# Infos: the output folder contains a
# 1.) summary with all motif RNA-maps (0_all_motif_rna_maps.pdf); the maps are ordered according to differences of the enrichment scores between groups
# (silenced vs. unregulated and enhanced vs unregulated) from most positive differences to most negative differences.
# Hence, maps with most differences come first and last.
# 2.) all motif RNA-maps as PDFs
# More infos: This analysis can be done with all input tables describing exons, wherever they come from, if they contain the necessary pieces of information:
# 1.) exon start coordinate
# 2.) exon end
# 3.) scaffold/chromosome of exon
# 4.) strand
# 5.) flanking borders of flanking exons
# 6.) group IDs
#
# 3.6 Generate a Nova2 RNA-map with a Nova2 regular expression derived from known Nova2 binding sites
# To generate a motif RNAmap with a Nova2 binding motif, first a table describing this motif must be created.
# As the Matt command rna_maps accepts motifs defined by Perl regular expressions and position weight matrices (PWMs), the
# table needs to have 5 columns including THRESH and BGMODEL which only apply for PWMs. As the Nova2 motif is a
# regular expression, the value NA is put as value in these two columns.
echo -e "TYPE\tNAME\tREGEXP\tTHRESH\tBGMODEL
REGEXP\tNOVA2\t[CT]CA[CT](.{0,3}[CT]{0,1}CA[CT]|.{4,23}[CT]CA[CT]){2}\tNA\tNA
" > nova2_regexp.tab
# The necessary chromosome sequence for Mmu09 can be downloaded here:
# Linux: wget http://matt.crg.eu/big_data/Mmu09_gDNA.fasta.gz (700 MB)
# Mac : curl -OL http://matt.crg.eu/big_data/Mmu09_gDNA.fasta.gz
fasta=/path/to/...../Mmu09_gDNA.fasta
output_dir=nova2_map
# By specifying the argument -frd 0.01 10000, p-values will be estimated for the motif-enrichment differences between
# silenced vs unregulated and enhanced vs. regulated with a permutation test (1000 permutations) and regions in the
# motif RNA-map will be highlighted according to a FDR threshold of 0.01 .
# This call runs 1-2 hours due to the permutation test.
matt rna_maps exons.tab UPSTRM_EX_BORDER START END DOSTRM_EX_BORDER SCAFFOLD STRAND GROUP[silenced,enhanced,unregulated] 31 35 135 $fasta nova2_regexp.tab TYPE NAME REGEXP THRESH BGMODEL -d $output_dir -fdr 0.01 1000
#
# 3.7 Extract 50 nt long sequences upstream of exons
# For Extracting sequences Matt offers the utility command get_seqs, which can be invoked in three different modi.
# The second modus is designed for extracting upstream or downstream sequences of AS events, like exons.
# Again, a FASTA file with chromosome sequences is necessary, and can be downloaded here:
# Linux: wget http://matt.crg.eu/big_data/Mmu09_gDNA.fasta.gz (700 MB)
# Mac : curl -OL http://matt.crg.eu/big_data/Mmu09_gDNA.fasta.gz
fasta=/path/to/...../Mmu09_gDNA.fasta
# Sequences from strand - will be reverse complemented automatically.
# Piping allows to add the generated two columns with sequences (SEQ_UP and SEQ_DOWN) directly to the exon table.
matt get_seqs exons.tab START END 50 50 SCAFFOLD STRAND $fasta | matt add_cols exons.tab -
#
# 3.8 Enrichment analysis in extracted 50 nt long upstream sequences with all CISBP-RNA motifs
#
# This command outputs results in form of a table to STDOUT which will be redirected into a file.
output_file=cisbprna_enrichment_test.tab
matt test_cisbp_enrich exons.tab SEQ_UP GROUP[silenced,unregulated,enhanced,unregulated] 1000 > $output_file
#
#
#
# 4. DOWNLOAD RNA-SEQ DATA FROM GEO WITH HELP OF MATT
# ===================================================
#
# Necessary only, if you want to do the estimation of inclusion levels with vast-tools by your own (Section 5).
# For this estimation, the program vast-tools needs to be installed.
#
# RNAseq paired-end data from mouse cortex; 3 x Nova2 KD and 3 x WT
#
# Step 4.1: The file accession_numbers.txt describes one data set per line, listing its ID (must be taken from the GEO website)
# and, optionally, a file name for the final sequencing-read files. Extracted read files will be renamed if file names are given.
# This file must not contain a header, and entries per line can be separated by blank, tab, or comma.
# This file can contain more columns; Only the first two columns will be considered. All further columns will be ignored.
echo "SRR3532927 nova2kd3
SRR3532926 nova2kd2
SRR3532925 nova2kd1
SRR3532924 nova2wt3
SRR3532923 nova2wt2
SRR3532922 nova2wt1
" > accession_numbers.txt
#
# Step 4.2: Using Matt for downloading data from the GEO RNAseq repository.
# accession_numbers.txt : contains data set IDs and file names, i.e., extracted sequencing-read files will be renamed
# -fasta : extracts FASTA files (if omitted, FASTQ files would be extracted)
# -keepsra : downloaded archives will not be deleted (if omitted, they would be deleted after read extraction)
# -o rnaseq_data : set output directory to rnaseq_data
# -p 6 : use 6 parallel processes for downloading and extraction
#
# Notes: - downloaded data will be in total approx. 6 x 6 GB = 36 GB
# - will take some time
# - to save space, you might want to delete the downloaded SRA archives in folder rnaseq_data or run
# the following command without argument -keepsra
# - as vast-tools does not consider quality scores of sequences reads, extracting FASTA files instead
# of FASTQ files saves storage space
matt retr_rnaseq accession_numbers.txt -fasta -keepsra -o rnaseq_data -p 6
#
#
# 5. ESTIMATE INCLUSION LEVELS OF AS EVENTS OF ANY TYPE GENOME-WIDE WITH PROGRAM VAST-TOOLS
# =========================================================================================
#
# Note: Matt is designed for the down-stream analysis of splicing events.
# Hence, estimating inclusion levels from RNA-seq data for splicing events needs to be done with other programs.
# Vast-tools is one of these programs, and it needs to be installed with data base of splicing events in mouse
# (you will be asked for installation of this splicing-event data base after having installed vast-tools and
# starting it for the first time. More information on vast-tools (with download) can be found here:
# https://github.com/vastgroup/vast-tools
#
# Note: Estimating inclusion levels with VAST-TOOLS is done in three steps:
# 3.1.) for each RNAseq sample: reads will be mapped to the data base of splicing events
# 3.2.) optional: merge mapping statistics of several samples into a new sample (e.g. merge replicates into one final sample)
# 3.3.) estimate inclusion levels genome-wide for all types of events and output a final table with rows corresponding to events
# and columns to information on events including inclusion levels for all RNA-seq data sets.
#
# Step 5.1: Mapping each sample with VAST-TOOLS align
for file in nova2kd1 nova2kd2 nova2kd3 nova2wt1 nova2wt2 nova2wt3
do
# These jobs should be sent to a HPC cluster.
# vast-tools align rnaseq_data/${file}_1.fasta.gz rnaseq_data/${file}_2.fasta.gz -sp Mmu
done
#
# Step 5.2: Merging the three Nova2 KD replicates into one sample, and the three Nova2 WT replicated into one sample
# A rational of merging replicates into one sample is to increase the read coverage. For good estimates of
# inclusion levels in human or mouse, RNA-seq data sets with at least 100 Mio reads are recommended, 150 Mio reads would be better.
# For merging the mapping statistics of the single RNA-seq data sets, we need to generate a file with information on
# which samples should get merged. Per line: original sample namename of new merged sample. As we work with
# paired-end reads, the file name of the reads 1 will be the sample name internally used by vast-tools for each sample.
echo -e "nova2kd1_1\tnova2kd
nova2kd2_1\tnova2kd
nova2kd3_1\tnova2kd
nova2wt1_1\tnova2wt
nova2wt2_1\tnova2wt
nova2wt3_1\tnova2wt
" > merge_samples.txt
#
# do merging
vast-tools merge -g merge_samples.txt
#
# Step 5.3: Estimate inclusion levels and generate final output table, which will be vast_out/INCLUSION_LEVELS_FULL-Mmu8-mm9.tab.
# This table contains the estimated inclusion levels for each single sample and for the merged samples.
vast-tools combine -sp Mmu
#
#
# 6. DOWN-STREAM ANALYSIS WITH MATT
# =================================
#
# The final output table of vast-tools with inclusion levels genome-wide for AS events of type SE, IR, ALT3, ALT5 is:
vts_file=vast_out/INCLUSION_LEVELS_FULL-Mmu8-mm9.tab
# More information on the structure and contents of vast-tools output tables can be found on the vast-tools github website:
# https://github.com/vastgroup/vast-tools
#
# The user can download this vast-tools output table (which needs to be uncompressed) from:
# wget http://matt.crg.eu/big_data/INCLUSION_LEVELS_FULL-Mmu8-mm9.tab.gz
# or on Mac
# curl -OL http://matt.crg.eu/big_data/INCLUSION_LEVELS_FULL-Mmu8-mm9.tab.gz
# gunzip INCLUSION_LEVELS_FULL-Mmu8-mm9.tab.gz
# vts_file=INCLUSION_LEVELS_FULL-Mmu8-mm9.tab
#
# Step 6.1: Inspecting the column names of this table reveals it's contents and column structure. It contains, besides other
# pieces of information, for each AS event its estimated inclusion level for all original samples
# (3 replicates per condition) and for the two merged samples.
matt get_colnms $vts_file
#
# Step 6.2: To get an idea of the values in this table (which shows that, e.g., inclusion levels are in the range of [0;100]),
# it can be printed nicely to screen with (where arrow keys up, down, left, right can be used for navigation):
matt prnt_tab $vts_file -W 10 | less -S -
#
# Step 6.3: To investigate the contents of this table further, one can list and check all distinct entries in column COMPLEX,
# which encode the type of AS event, with:
matt col_uniq $vts_file COMPLEX
#
# Step 6.4: Extract from vast-tools output table skipped exons
# get_vast is a utility command for extracting splicing events from vast-tools output tables
# - complex : restricts retrieved events; vast-tools encodes skipped-exon events with IDs S, C1, C2, C3, MIC
# -a nova2kd : is the merged Nova2 KD sample
# -b nova2wt : is the merged Nova2 WT sample
# -minqab VLOW : vast-tools reports for each estimated PSI value a quality (N,VLOW,LOW,OK,SOK) and we restrict here the output to events to those for which the nova2kd and the nova2wt PSI could be estimated with at least VLOW quality.
# -minqglob N : similar to -minqab but applies to all other samples in the table beside the samples defined with arguments -a and -b.
# Setting it to N prevents the exclusion of events, which have VLOW in -a and -b samples, but worse quality in the other samples.
# -gtf and -f : gene IDs will be extracted automatically
gtf=/path/to/...../Mmu09.gtf
matt get_vast $vts_file -complex S,C1,C2,C3,MIC -a nova2kd -b nova2wt -minqab VLOW -minqglob N -gtf $gtf -f gene_id > exons.tab
#
# Step 6.5: Check the number of events (all are skipped exons) obtained.
# The number of obtained events is different from the overall-number in the vast-tools output table.
# The reason is that some events do not fulfill the minimum-quality constraint as specified with the command get_vast.
matt col_uniq exons.tab COMPLEX
#
# Step 6.6: Check column names of table with retrieved skipped exons
# Applying get_vast as split the information mixed in some columns of vast-tool's output file into distinct columns and has added columns like
# STRAND, flanking borders of flanking exons, GENEID, DPSI_GRPA_MINUS_GRPB with deltaPSIs nova2kd vs. nova2wt, and the number of inclusion- and exclusion reads.
matt get_colnms exons.tab
#
# Step 6.7: Definition of groups of skipped exons to be compared in subsequent analyses.
# The command def_cats outputs a table with one column named GROUP containing the group IDs.
# We specify three groups:
# 'silenced=DPSI_GRPA_MINUS_GRPB[25,100]' : exons silenced by Nova2 have a dPSI in [25,100], i.e., upon Nova2 KD they are more included meaning, under normal conditions, Nova2 would silence them.
# 'enhanced=DPSI_GRPA_MINUS_GRPB[-100,-25]' : exons enhanced by Nova2 have a dPSI in [-100,-25]
# 'unregulated=DPSI_GRPA_MINUS_GRPB[-1,1] PSI_nova2wt[10,90]' : unregulated exons have a dPSI in [-1,1] and a avg. PSI in the WT sample in [10,90], i.e., they are truly alternatively skipped-exons not constitutive.
# All events not falling in any of these groups will get the group ID NA = Not Available.
# Note: Here, piping is applied and the command col_uniq on the output table is used to directly count the number of events per group.
# By this, the user might change the conditions defined and immediately check the number of events per group.
matt def_cats exons.tab GROUP 'silenced=DPSI_GRPA_MINUS_GRPB[25,100]' 'enhanced=DPSI_GRPA_MINUS_GRPB[-100,-25]' 'unregulated=DPSI_GRPA_MINUS_GRPB[-1,1] PSI_nova2wt[10,90]' | matt add_cols exons.tab -
#
# Finally, these group definitions are applied.
# Again, piping is used together with the command add_cols to add the generated GROUP column directly to the file exons.tab.
matt def_cats exons.tab GROUP 'silenced=DPSI[25,100]' 'enhanced=DPSI[-100,-25]' 'unregulated=DPSI[-1,1] PSI_nova2wt[10,90]' | matt add_cols exons.tab -
# Final check of number events per group in exons.tab
matt col_uniq exons.tab GROUP
#
#
# 6.8 Feature extraction of exons and discriminative feature detection
# ... which includes automatic retrieval of genomic sequences and detection of up/down-stream introns and exons.
# Therefore, the user needs to specify a FASTA file with chromosome sequences of mouse (Mmu09) and a genome annotation from Ensembl.
# Sequence IDs therein must match to sequence IDs in exons.tab, which are Ensembl gene IDs as this table was generated by vast-tools.
# For ease of this example, users can download the Mmu09 chromosomes and gene annotation here:
# Linux: wget http://matt.crg.eu/big_data/Mmu09_gDNA.fasta.gz (700 MB)
# wget http://matt.crg.eu/big_data/Mmu09.gtf.gz (15 MB)
# Mac : curl -OL http://matt.crg.eu/big_data/Mmu09_gDNA.fasta.gz
# curl -OL http://matt.crg.eu/big_data/Mmu09.gtf.gz
# gunzip Mmu09_gDNA.fasta.gz
# gunzip Mmu09.gtf.gz
fasta=/path/to/...../Mmu09_gDNA.fasta
gtf=/path/to/...../Mmu09.gtf
output_dir=cmpr_exons
matt cmpr_exons exons.tab START END SCAFFOLD STRAND GENEID $gtf $fasta Mmus 150 GROUP[silenced,enhanced,unregulated] $output_dir -colors:deepskyblue,firebrick4,lightgreen
# Infos: the output folder contains a
# 1.) summary PDF report with box plots (summary.pdf)
# 2.) a table with all events and extracted features (exons_with_efeatures.tab)
# 3.) a table with all p values and information on performed statistical tests (overview_of_features_and_comparisons.tab)
# 4.) all box plots as PDFs in directory summary_graphics
# More infos: This analysis can be done with all input tables describing exons, wherever they come from, if they contain the necessary pieces of information:
# 1.) exon start coordinate
# 2.) exon end
# 3.) scaffold/chromosome of exon
# 4.) strand
# 5.) any type of gene ID which matches the gene IDs in the given GTF file with gene annotation
# 6.) group IDs
#
# 6.9 Generate motif RNA-maps for all CISBP-RNA consensus motifs comparing the three exon groups
# A FASTA file with chromosome sequences of mouse (Mmu09) must be given as sequences of the exons and flanking introns
# need to be retrieved for generating a motif RNA-map. Sequence IDs therein must match sequence IDs in exons.tab.
fasta=/path/to/...../Mmu09_gDNA.fasta
output_dir=cisbprna_maps
matt rna_maps_cisbp exons.tab UPSTRM_EX_BORDER START END DOSTRM_EX_BORDER SCAFFOLD STRAND GROUP[silenced,enhanced,unregulated] 31 35 135 $fasta cisbprna_regexps -d $output_dir
# Infos: the output folder contains a
# 1.) summary with all motif RNA-maps (0_all_motif_rna_maps.pdf); each motif RNA-map is printed twice: with and without a data coverage plot.
# The maps are ordered according to differences of the enrichment scores between groups
# (silenced vs. unregulated and enhanced vs unregulated) from most positive differences to most negative differences.
# Hence, maps with most differences come first and last.
# 2.) all motif RNA-maps as PDFs
# More infos: This analysis can be done with all input tables describing exons, wherever they come from, if they contain the necessary pieces of information:
# 1.) exon start coordinate
# 2.) exon end
# 3.) scaffold/chromosome of exon
# 4.) strand
# 5.) flanking borders of flanking exons
# 6.) group IDs
#
# 6.10 Generate a Nova2 RNA-map with a Nova2 regular expression derived from known Nova2 binding sites
# To generate a motif RNAmap with a Nova2 binding motif, first a table describing this motif must be created.
# As the Matt command rna_maps accepts motifs defined by Perl regular expressions and position weight matrices (PWMs), the
# table needs to have 5 columns including THRESH and BGMODEL which only apply for PWMs. As the Nova2 motif is a
# regular expression, the value NA is put as value in these two columns.
echo -e "TYPE\tNAME\tREGEXP\tTHRESH\tBGMODEL
REGEXP\tNOVA2\t[CT]CA[CT](.{0,3}[CT]{0,1}CA[CT]|.{4,23}[CT]CA[CT]){2}\tNA\tNA
" > nova2_regexp.tab
# The necessary chromosome sequence for Mmu09 can be downloaded here:
# Linux: wget http://matt.crg.eu/big_data/Mmu09_gDNA.fasta.gz (700 MB)
# Mac : curl -OL http://matt.crg.eu/big_data/Mmu09_gDNA.fasta.gz
fasta=/path/to/...../Mmu09_gDNA.fasta
output_dir=nova2_map
# By specifying the argument -frd 0.01 10000, p-values will be estimated for the motif-enrichment differences between
# silenced vs. unregulated and enhanced vs. regulated with a permutation test (1000 permutations) and regions in the
# motif RNA-map with significant differences will be highlighted according to a FDR threshold of 0.01.
# This call runs 1-2 hours due to the permutation test.
matt rna_maps exons.tab UPSTRM_EX_BORDER START END DOSTRM_EX_BORDER SCAFFOLD STRAND GROUP[silenced,enhanced,unregulated] 31 35 135 $fasta nova2_regexp.tab TYPE NAME REGEXP THRESH BGMODEL -d $output_dir -fdr 0.01 1000
#
# 6.11 Extract 50 nt long sequences upstream of exons
# For Extracting sequences Matt offers the utility command get_seqs, which can be invoked in three different modi.
# The second modus is designed for extracting upstream or downstream sequences of AS events, like exons.
# Again, a FASTA file with chromosome sequences is necessary, and can be downloaded here:
# Linux: wget http://matt.crg.eu/big_data/Mmu09_gDNA.fasta.gz (700 MB)
# Mac : curl -OL http://matt.crg.eu/big_data/Mmu09_gDNA.fasta.gz
fasta=/path/to/...../Mmu09_gDNA.fasta
# Sequences from strand - will be reverse complemented automatically.
# Piping allows to add the generated two columns with sequences (SEQ_UP and SEQ_DOWN) directly to the exon table.
matt get_seqs exons.tab START END 50 50 SCAFFOLD STRAND $fasta | matt add_cols exons.tab -
#
# Checking extended table; It has two more columns: SEQ_UP and SEQ_DOWN containing the 50 nt long sequences extracted upstream and downstream of the exons.
matt get_colnms exons.tab
#
# 6.12 Enrichment analysis in extracted 50 nt long upstream sequences with all CISBP-RNA motifs
# This command outputs results in form of a table to STDOUT which will be redirected into a file.
output_file=cisbprna_enrichment_test.tab
# For estimating p values, it applies a permutation test a user-specified number of iterations, here 1000.
# Comparisons done are specified by GROUP[silenced,unregulated,enhanced,unregulated], meaning silenced vs. unregulated and enhanced vs. unregulated
matt test_cisbp_enrich exons.tab SEQ_UP GROUP[silenced,unregulated,enhanced,unregulated] 1000 > $output_file
#