### This appendix gives the code used for bioinformatic analyses associated with Bull et al. 2023, "Different rearing environments result in minor, but detectable, differences in selection at later life history stages in hatchery and wild coho salmon (Oncorhynchus kisutch) sharing a common environment."

### Section 1 - Quality control, trimming and mapping of reads for each of the three read sets. Run for each sample for each read set (read set = sequencing run).

### Section 1.1 - Quality control

## Load required modules
module purge
module load fastqc/0.11.8

## Copy data to scratch (480GB SSD per node) and navigate there
cp $inputdir/*_R1.fastq.gz $SLURM_TMPDIR
cp $inputdir/*_R2.fastq.gz $SLURM_TMPDIR
cd $SLURM_TMPDIR

## Run fastqc for each set of reads
mkdir 1_FastQC
for i in *.fastq.gz;do
fastqc $i -o $SLURM_TMPDIR/1_FastQC/
done

## Run multiQC to summarize FastQC output
cd 1_FastQC
source $multiqc_virtualenv # <- virtual environment with multiqc and umi_tools installed
multiqc .
multiqc *R1* -n multiqc_report_R1
multiqc *R2* -n multiqc_report_R2
deactivate

## Copy analyses back to storage
cp -r . $inputdir/

### Section 1.2 - UMI extraction

## Load required modules
module purge
module load python

## Load virtual environment
source $multiqc_virtualenv # <- virtual environment with multiqc and umi_tools installed

## Perform UMI_TOOLS extraction
cd $SLURM_TMPDIR
cp $inputdir/ids* $SLURM_TMPDIR
n=1
while IFS= read -r line;do
# Extract sample name
j=$(echo $line | cut -d '_' -f1)
k=$(echo $j | cut -d '.' -f5)
echo "starting sample $n id $j"
# Define input files
read1=$j\_R1.fastq.gz
read2=$j\_R2.fastq.gz
umis_full=$(ls $inputdir/UMIs.2/*$k*)
umis_short=$(echo $umis_full | cut -d '/' -f11) # <- field index depends on directory depth
cp $inputdir/*$j* $SLURM_TMPDIR
cp $umis_full $SLURM_TMPDIR
# Process read 1
umi_tools extract -I $umis_short --bc-pattern=NNNNNN --read2-in=$read1 --stdout=out.out.gz --read2-out=${read1%.*.*}.UMI.fastq.gz
cp ${read1%.*.*}.UMI.fastq.gz $inputdir
# Process read 2
umi_tools extract -I $umis_short --bc-pattern=NNNNNN --read2-in=$read2 --stdout=out.out.gz --read2-out=${read2%.*.*}.UMI.fastq.gz
cp ${read2%.*.*}.UMI.fastq.gz $inputdir
# Clean up tmpdir for next sample and advance counter
rm *$k*
rm out.out.gz
n=$(($n+1))
done < ids01 # <- plain text file with the names of the read 1 files to process
deactivate
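
## A minimal sketch (not part of the original pipeline) of how an ids01-style list could be
## generated: one read 1 file name per line, optionally split into chunks (ids00, ids01, ...)
## so several extraction jobs can run in parallel. The list and chunk names, and the chunk
## size of 20, are illustrative assumptions.
cd $inputdir
ls *_R1.fastq.gz > ids_all
split -l 20 -d ids_all ids # <- writes ids00, ids01, ... with 20 samples per file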
### Section 1.3 - Dediversification and trimming

## Load required modules
module purge
module load nixpkgs/16.09
module load python/2.7.14

## Do the trimming
mkdir $inputdir/TrimGaloreReports
cd $SLURM_TMPDIR

## Extract sample name and define read 1 and read 2 files
j=SAMPLENAME # <- define which sample to run this for
read1=$j\_R1.UMI.fastq.gz
read2=$j\_R2.UMI.fastq.gz

## Copy relevant sample to local node
cp $inputdir/*$j*UMI* $SLURM_TMPDIR

### Initial trimming
# This filters based on a 35 bp minimum length and a phred 20 quality cutoff
echo "### Step 1 - initial trim - ID $j ###"
source $cutadapt_virtualenv # <- virtual environment with cutadapt installed
perl $trimgalore --paired --length 35 --retain_unpaired --nextseq 20 -a AGATCGGAAGAGC -a2 AAATCAAAAAAAC $read1 $read2
deactivate
# Renaming; 1U/2U labelled as "group1" to separate them from unpaired reads generated later in this pipeline step
mv $j\_R1.UMI_val_1.fq.gz $j\.UMI.trimgalore.1P.fastq.gz
mv $j\_R2.UMI_val_2.fq.gz $j\.UMI.trimgalore.2P.fastq.gz
mv $j\_R1.UMI_unpaired_1.fq.gz $j\.UMI.trimgalore.1U_group1.fastq.gz
mv $j\_R2.UMI_unpaired_2.fq.gz $j\.UMI.trimgalore.2U_group1.fastq.gz

### Trim off diversity bases (needs to come after adapter trimming, as the diversity bases will also be on the 3' end of a read when read-through occurs) - loses reads with Ns in the CGG/TGG site, but <1% of reads in checked files
## Reads still paired
echo "### Step 2.1 - dediversify - paired reads - ID $j ###"
python $diversitytrimscript -1 $j.UMI.trimgalore.1P.fastq.gz -2 $j.UMI.trimgalore.2P.fastq.gz # <- dediversification script provided by NuGen
mv $j.UMI.trimgalore.1P.fastq_trimmed.fq.gz $j.UMI.trimgalore.dediversified.1P.fastq.gz
mv $j.UMI.trimgalore.2P.fastq_trimmed.fq.gz $j.UMI.trimgalore.dediversified.2P.fastq.gz

## Forward only reads
echo "### Step 2.2 - dediversify - unpaired forward reads - ID $j ###"
python $diversitytrimscript -1 $j.UMI.trimgalore.1U_group1.fastq.gz
mv $j.UMI.trimgalore.1U_group1.fastq_trimmed.fq.gz $j.UMI.trimgalore.dediversified.1U_group1.fastq.gz

## Reverse only reads - need to dummy forward reads to get the script to handle these
echo "### Step 2.3 - dediversify - unpaired reverse reads - ID $j ###"
cp $j.UMI.trimgalore.2U_group1.fastq.gz $j.UMI.trimgalore.2U_group1.2.fastq.gz
gunzip $j.UMI.trimgalore.2U_group1.2.fastq.gz
# Pull the header line of every record (grepping for "@" is fragile here, as quality strings can also contain "@")
awk 'NR % 4 == 1' $j.UMI.trimgalore.2U_group1.2.fastq > temp01 ### "temp##" NEEDS TO BE UNIQUE PER SCRIPT FOR THIS TO WORK ###
rm $j.UMI.trimgalore.2U_group1.2.fastq
while IFS= read -r line; do
echo $line >> $j.UMI.trimgalore.2U_group1.dummy.fastq
echo "CGGTTTTTTTTTTTTTTT" >> $j.UMI.trimgalore.2U_group1.dummy.fastq
echo "+" >> $j.UMI.trimgalore.2U_group1.dummy.fastq
echo "HHHHHHHHHHHHHHHHHH" >> $j.UMI.trimgalore.2U_group1.dummy.fastq
done < temp01
gzip $j.UMI.trimgalore.2U_group1.dummy.fastq
python $diversitytrimscript -1 $j.UMI.trimgalore.2U_group1.dummy.fastq.gz -2 $j.UMI.trimgalore.2U_group1.fastq.gz
mv $j.UMI.trimgalore.2U_group1.fastq_trimmed.fq.gz $j.UMI.trimgalore.dediversified.2U_group1.fastq.gz
rm *dummy*

### Secondary trimming (accounts for bases lost to dediversification) for paired and unpaired reads; also hard trims 3 bases off the end of each read
echo "### Step 3.1 - secondary trim - paired reads - ID $j ###"
source $cutadapt_virtualenv
perl $trimgalore --paired --length 35 --nextseq 20 --retain_unpaired --three_prime_clip_R1 3 --three_prime_clip_R2 3 $j.UMI.trimgalore.dediversified.1P.fastq.gz $j.UMI.trimgalore.dediversified.2P.fastq.gz
# Renaming; 1U/2U labelled as "group2" to separate them from the unpaired reads generated earlier in this pipeline step
mv $j.UMI.trimgalore.dediversified.1P_val_1.fq.gz $j.UMI.trimgalore2.dediversified.1P.fastq.gz
mv $j.UMI.trimgalore.dediversified.2P_val_2.fq.gz $j.UMI.trimgalore2.dediversified.2P.fastq.gz
mv $j.UMI.trimgalore.dediversified.1P_unpaired_1.fq.gz $j.UMI.trimgalore2.dediversified.1U_group2.fastq.gz
mv $j.UMI.trimgalore.dediversified.2P_unpaired_2.fq.gz $j.UMI.trimgalore2.dediversified.2U_group2.fastq.gz

echo "### Step 3.2 - secondary trim - unpaired forward reads - ID $j ###"
perl $trimgalore --length 35 --nextseq 20 --three_prime_clip_R1 3 --three_prime_clip_R2 3 $j.UMI.trimgalore.dediversified.1U_group1.fastq.gz
mv $j.UMI.trimgalore.dediversified.1U_group1_trimmed.fq.gz $j.UMI.trimgalore2.dediversified.1U_group1.fastq.gz

echo "### Step 3.3 - secondary trim - unpaired reverse reads - ID $j ###"
perl $trimgalore --length 35 --nextseq 20 --three_prime_clip_R1 3 --three_prime_clip_R2 3 $j.UMI.trimgalore.dediversified.2U_group1.fastq.gz
mv $j.UMI.trimgalore.dediversified.2U_group1_trimmed.fq.gz $j.UMI.trimgalore2.dediversified.2U_group1.fastq.gz
deactivate

### Copy output back
echo "### Copying output back to cluster - ID $j ###"
cp $j.UMI.trimgalore2.dediversified.1P.fastq.gz $inputdir
cp $j.UMI.trimgalore2.dediversified.2P.fastq.gz $inputdir
cp $j.UMI.trimgalore2.dediversified.1U_group2.fastq.gz $inputdir
cp $j.UMI.trimgalore2.dediversified.2U_group2.fastq.gz $inputdir
cp $j.UMI.trimgalore2.dediversified.1U_group1.fastq.gz $inputdir
cp $j.UMI.trimgalore2.dediversified.2U_group1.fastq.gz $inputdir
cp *$j*trimming_report* $inputdir/TrimGaloreReports

## Clean up scratch
cd $SLURM_TMPDIR
rm -r *

### Section 1.4 - Quality control post trimming

## Load required modules
module purge
module load fastqc/0.11.8

## Copy data to scratch (480GB SSD per node) and navigate there
cp $inputdir/*UMI.trimgalore2.dediversified.1P.fastq.gz $SLURM_TMPDIR
cp $inputdir/*UMI.trimgalore2.dediversified.2P.fastq.gz $SLURM_TMPDIR
cd $SLURM_TMPDIR

## Run fastqc for each set of reads
mkdir 1_FastQC
for i in *.fastq.gz;do
fastqc $i -o $SLURM_TMPDIR/1_FastQC/
done

## Run multiQC to summarize FastQC output
cd 1_FastQC
source $multiqc_virtualenv # <- virtual environment with multiqc and umi_tools installed
multiqc .
multiqc *1P* -n multiqc_report_1P
multiqc *2P* -n multiqc_report_2P
deactivate

## Copy analyses back to storage
cp -r . $inputdir/
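
## A minimal sketch (an assumption, not shown in the original appendix) of how the per-sample
## steps (e.g. Sections 1.3 and 1.5, which each require a sample name) might be driven as a
## SLURM array job, with each task pulling one sample name from a plain-text list. The array
## size, time limit, list file and wrapper script names are illustrative placeholders.
#!/bin/bash
#SBATCH --array=1-96
#SBATCH --time=12:00:00
sample_list=$inputdir/sample_names.txt # <- hypothetical list, one sample name per line
j=$(sed -n "${SLURM_ARRAY_TASK_ID}p" $sample_list)
echo "array task $SLURM_ARRAY_TASK_ID -> sample $j"
bash trim_and_align_one_sample.sh $j # <- hypothetical wrapper around the Section 1.3-1.5 commands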
### Section 1.5 - Align reads to reference genome
j=???? # <- define which sample to run this for
ref_coho_dir=???? # <- define location of reference genome

## Load required modules
module purge
module load nixpkgs/16.09
module load gcc/7.3.0
module load bismark/0.22.1
module load bowtie2
module load samtools/1.9

## Copy data to local node (480GB SSD per node)
cp $inputdir/$j.UMI.trimgalore2.dediversified.1P.fastq.gz $SLURM_TMPDIR
cp $inputdir/$j.UMI.trimgalore2.dediversified.2P.fastq.gz $SLURM_TMPDIR

## Perform alignment for this sample
cd $SLURM_TMPDIR
echo "starting sample $j"
read1P=$j.UMI.trimgalore2.dediversified.1P.fastq.gz
read2P=$j.UMI.trimgalore2.dediversified.2P.fastq.gz

## Align paired reads; the output BAM is sorted (to allow merging across runs), renamed and copied back to the work directory
perl $EBROOTBISMARK/bismark --genome $ref_coho_dir -1 $read1P -2 $read2P
samtools sort $j.UMI.trimgalore2.dediversified.1P_bismark_bt2_pe.bam -o $j.UMI.trimgalore2.dediversified.1P_bismark_bt2_pe.sorted.bam
mv $j.UMI.trimgalore2.dediversified.1P_bismark_bt2_pe.sorted.bam $j.UMI.trimgalore2.dediversified.PE.bismark.bam
mv $j.UMI.trimgalore2.dediversified.1P_bismark_bt2_PE_report.txt $j.UMI.trimgalore2.dediversified.PE.bismark_report.txt
cp $j.UMI.trimgalore2.dediversified.PE.bismark.bam $inputdir
cp $j.UMI.trimgalore2.dediversified.PE.bismark_report.txt $inputdir/5_AlignmentReports

## Clean up scratch
cd $SLURM_TMPDIR
rm -r *

### Section 2 - Combine reads across sequencing runs for each sample, deduplicate reads, and score SNPs.

### Section 2.1 - Combine reads for each sample. Run once; loops over all samples.
inputdirHS=???? # <- directory containing processed reads from sequencing run 1
inputdirNS2020=???? # <- directory containing processed reads from sequencing run 2
inputdirNS2021=???? # <- directory containing processed reads from sequencing run 3
inputdirALL3=???? # <- directory to write out combined reads

## Set up environment
module purge
module load python
module load samtools

## Loop over samples and run code
n=1
cd $SLURM_TMPDIR
for i in $inputdirHS/*.UMI.trimgalore2.dediversified.PE.bismark.bam; do
j=$(echo $i | cut -d '/' -f9) # <- field index depends on directory depth
k=${j%.*.*.*.*.*.*}
l=$(echo $k | cut -d '.' -f5)
echo "starting sample $n id $k"
cp $inputdirHS/*.$l.UMI.trimgalore2.dediversified.PE.bismark.bam $SLURM_TMPDIR/HS.bam
cp $inputdirNS2020/*.$l.UMI.trimgalore2.dediversified.PE.bismark.bam $SLURM_TMPDIR/NS2020.bam
cp $inputdirNS2021/*.$l.UMI.trimgalore2.dediversified.PE.bismark.bam $SLURM_TMPDIR/NS2021.bam
date
samtools merge -fr ALL3.bam HS.bam NS2020.bam NS2021.bam
samtools sort -o $l.ALL3.sorted.bam ALL3.bam
date
cp $l.ALL3.sorted.bam $inputdirALL3
rm $l.ALL3.sorted.bam
rm ALL3.bam
rm HS.bam
rm NS2020.bam
rm NS2021.bam
n=$(($n+1))
done
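
## Optional sanity check (not part of the original pipeline): confirm each merged BAM is
## intact and carries one read group per sequencing run (samtools merge -r attaches an RG
## tag derived from each input file). The file-name pattern is taken from the loop above.
for b in $inputdirALL3/*.ALL3.sorted.bam; do
samtools quickcheck $b && echo "$b passed quickcheck"
echo "read groups: $(samtools view -H $b | grep -c '^@RG')" # <- expect 3 (HS, NS2020, NS2021)
done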
## Clean up scratch
cd $SLURM_TMPDIR
rm -r *

### Section 2.2 - Deduplicate reads using UMIs. Run for each sample.
j=???? # <- bam file from the last step
k=???? # <- short sample ID

## Set input and output directories
inputdirALL3=???? # <- directory with combined read bam files

## Set up environment
module purge
module load python
module load samtools
module load bedtools

## Set up to run code
cd $SLURM_TMPDIR
cp $inputdirALL3/$j $SLURM_TMPDIR
source $multiqcdir/ENV/bin/activate # <- virtual environment with multiqc and umi_tools installed

# Process paired reads. First convert to sam, then use sed to remove barcode IDs from the read names (these interfere with umi_tools), then convert back to bam (using samtools sort). Index the bam and then run the deduplication.
samtools view -h -o $k.ALL3.sorted.sam $j
sed -i 's/_.:N:0:........//g' $k.ALL3.sorted.sam
samtools sort -o $j $k.ALL3.sorted.sam
rm $k.ALL3.sorted.sam
samtools index $j
umi_tools dedup -I $j --paired -S $k.ALL3.deduped.bam --output-stats=$k.ALL3.deduped.stats
samtools index $k.ALL3.deduped.bam

# Calculate coverage (deduped)
bedtools genomecov -ibam $k.ALL3.deduped.bam > $k.ALL3.deduped.depth
grep -w genome $k.ALL3.deduped.depth > $k.ALL3.deduped.depth_genome

# Copy output back
cp $k.ALL3.deduped.bam $inputdirALL3
cp $k.ALL3.deduped.bam.bai $inputdirALL3
cp $k.ALL3.deduped.stats_* $inputdirALL3/2_All3_DedupReports+Depths
cp $k.ALL3.deduped.depth $inputdirALL3/2_All3_DedupReports+Depths
cp $k.ALL3.deduped.depth_genome $inputdirALL3/2_All3_DedupReports+Depths

## Close virtual environment
deactivate

## Clean up scratch
cd $SLURM_TMPDIR
rm -r *

### Section 2.3 - Call SNPs using bssnper2. Loops over samples.
inputdirALL3=???? # <- directory with deduplicated bam files
bssnper2=???? # <- bssnper2 install location
refnozip=???? # <- unzipped reference genome location

## Load required modules
module purge

## Copy BAM files to scratch and loop over samples
n=1
cd $SLURM_TMPDIR
for i in $inputdirALL3/*.ALL3.deduped.bam; do
## Copy over files and define variables
cp $i $SLURM_TMPDIR
j=$(echo $i | cut -d '/' -f7) # <- field index depends on directory depth
k=${j%.*.*.*}
## Print counter and current sample name
echo "starting sample $n id $k"
## Run bssnper2
$bssnper2 $j --sampleName=$k --ref $refnozip --vcf $k.bssnper2.vcf --homrefInVCF --minBaseQ=15 --minMapQ=20 --minCoverage=6 --maxCoverage=200 --minHomFreq=0.85 --minHetFreq=0.10 --errorRate=0.02 --minAltCoverage=2
## Copy output back to server
cp $k.bssnper2.vcf $inputdirALL3/3_All3_bssnper2
## Remove files before next sample
rm -f *
## Increase counter
n=$(($n+1))
done

### Section 2.4 - Combine per-sample VCF files into one VCF file.
inputdirALL3=???? # <- directory with per-sample VCF files

## Load required modules
module purge
module load nixpkgs/16.09
module load intel/2018.3
module load tabix

## Navigate to the directory with the per-sample vcf files
cd $inputdirALL3/3_All3_bssnper2/

## Compress and index each per-sample VCF, then merge
for i in *.bssnper2.vcf;do
bgzip $i # ~ 3 min per sample
tabix $i.gz # ~ 0.5 min per sample
done
list=$(ls *.vcf.gz | sed -z 's/\n/ /g')
module --force purge
module load StdEnv/2020
module load vcftools/0.1.16
vcf-merge $list > Coho.ALL.bssnper2.raw.vcf

## Clean up scratch
cd $SLURM_TMPDIR
rm -r *

### Section 3 - Filter SNPs and individuals for quality. NOT run as a job, but directly on the console, as this code runs quickly.
module load vcftools/0.1.16

# Remove sites scored in less than 50 % of individuals; sites need to be biallelic
vcftools --vcf Coho.ALL.bssnper2.raw.vcf --out Coho.ALL.bssnper2.filt1 --max-missing 0.5 --minDP 10 --maxDP 250 --min-alleles 2 --max-alleles 2 --recode
# 1515453 SNPs retained

# Calculate missingness for retained SNPs, and remove those individuals with scores for < 50 % of sites
vcftools --vcf Coho.ALL.bssnper2.filt1.recode.vcf --out Coho.ALL.bssnper2.filt1 --missing-indv
# Check the output (.imiss) file manually, e.g. in Excel
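
## A hedged alternative to the manual spreadsheet check (not in the original pipeline):
## list individuals missing more than 50 % of sites directly from the .imiss file written
## by --missing-indv (F_MISS is the fifth column).
awk 'NR > 1 && $5 > 0.5 {print $1, $5}' Coho.ALL.bssnper2.filt1.imiss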
# REMOVE 5 INDIVIDUALS - G3854, G3856, G3862, G3868, G3871. NEED TO MANUALLY LIST
vcftools --vcf Coho.ALL.bssnper2.filt1.recode.vcf --out Coho.ALL.bssnper2.filt2 --remove-indv G3854 --remove-indv G3856 --remove-indv G3862 --remove-indv G3868 --remove-indv G3871 --recode

# Remove sites scored in less than 75 % of individuals or with maf < 0.05, and thin to at most one SNP per 400 bp
vcftools --vcf Coho.ALL.bssnper2.filt2.recode.vcf --out Coho.ALL.bssnper2.filt3 --max-missing 0.75 --maf 0.05 --thin 400 --recode

# Remove sites with heterozygosity 0.65 or greater (using the --missing-site function in vcftools to count the number of scored individuals)
# Count the number of hets per SNP. The input here is the filt3 file (no hwe, no nosibs)
paste <(awk -F"\t" 'BEGIN {print "CHR\tPOS\tREF\tALT"} !/^#/ {print $1"\t"$2"\t"$4"\t"$5}' Coho.ALL.bssnper2.filt3.recode.vcf) \
<(awk 'BEGIN {print "nHet"} !/^#/ {print gsub(/0\|1|1\|0|0\/1|1\/0|0\|2|2\|0|0\/2|2\/0|0\|3|3\|0|0\/3|3\/0|0\|4|4\|0|0\/4|4\/0/, "")}' Coho.ALL.bssnper2.filt3.recode.vcf) > Coho.ALL.bssnper2.filt3.recode.hetcount
# 475 SNPs to be removed
vcftools --vcf Coho.ALL.bssnper2.filt3.recode.vcf --out Coho.ALL.bssnper2.filt3.nofixed --exclude-positions SNPS_het-over-065 --recode # <- SNPS_het-over-065 lists the positions with heterozygosity >= 0.65, compiled from the hetcount output

# LD trimming in bcftools (prune function). Calculate LD first for plotting
vcftools --vcf Coho.ALL.bssnper2.filt3.nofixed.recode.vcf --out Coho.ALL.bssnper2.filt3.nofixed --geno-r2 --ld-window-bp 500000
bcftools +prune -m 0.2 -w 500000 Coho.ALL.bssnper2.filt3.nofixed.recode.vcf -Ov -o Coho.ALL.bssnper2.filt3.nofixed.LD.vcf

# Check whether individuals need filtering, i.e. < 75 % of SNPs scored
vcftools --vcf Coho.ALL.bssnper2.filt3.nofixed.LD.vcf --out Coho.ALL.bssnper2.filt3.nofixed.LD --missing-indv
# No individuals need to be removed

# ID siblings and trim to one individual per family at random
vcftools --vcf Coho.ALL.bssnper2.filt3.nofixed.LD.vcf --out Coho.ALL.bssnper2.filt3.nofixed.LD --relatedness

# Reduce to one member per family; remove 28 individuals
vcftools --vcf Coho.ALL.bssnper2.filt3.nofixed.LD.vcf --out Coho.ALL.bssnper2.filt4.nosibs --remove-indv F163 --remove-indv F35 --remove-indv F41 --remove-indv F45 --remove-indv F48 --remove-indv F50 --remove-indv G3848 --remove-indv G3849 --remove-indv G3850 --remove-indv G3851 --remove-indv G3858 --remove-indv G3859 --remove-indv G3863 --remove-indv G3865 --remove-indv H656 --remove-indv H657 --remove-indv H659 --remove-indv H663 --remove-indv H664 --remove-indv H666 --remove-indv H667 --remove-indv H670 --remove-indv H675 --remove-indv H677 --remove-indv M161 --remove-indv M162 --remove-indv M169 --remove-indv M77 --recode
### 61 individuals, 23643 SNPs

# HWE-filtered version, for comparison. 20318 SNPs retained
vcftools --vcf Coho.ALL.bssnper2.filt4.nosibs.recode.vcf --out Coho.ALL.bssnper2.filt4.nosibs.HWE --hwe 0.05 --recode

### Section 4 - Downstream analyses.

### Section 4.1 - BayPass. XtX model
inputdir=???? # <- directory with BayPass files produced using PGDSpider2 and the geste2baypass.py script. Produce one for each contrast of interest.

## Load required modules
module purge
module load baypass/2.2

## Copy BayPass files to scratch - requires a BayeScan/GESTE-format file created first and then converted using the script above
cp $inputdir/*BAYPASS* $SLURM_TMPDIR
cd $SLURM_TMPDIR

## Run
n=1
for i in *BAYPASS*;do
echo "starting combination $n of 8, file $i"
i_baypass -gfile $i -outprefix out_$i
cp out_$i* $inputdir
n=$(($n+1))
done

## Clean up scratch
cd $SLURM_TMPDIR
rm -r *
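
## A hedged post-processing sketch (not part of the original pipeline): rank SNPs by their
## XtX estimate from one core-model run. It assumes BayPass 2.2 writes
## <outprefix>_summary_pi_xtx.out with an M_XtX column; check the output file and column
## names of your own run before use. The file name below is an illustrative assumption.
xtx=$inputdir/out_Coho.6POPS.BAYPASS_summary_pi_xtx.out # <- hypothetical output file
col=$(awk 'NR == 1 {for (i = 1; i <= NF; i++) if ($i == "M_XtX") print i}' $xtx)
tail -n +2 $xtx | sort -g -r -k "$col,$col" | head -20 # <- top 20 SNPs by XtX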
### Section 4.2 - BayPass. BF/contrast model
inputdir=???? # <- directory with BayPass files produced using PGDSpider2 and the geste2baypass.py script. Produce one for each contrast of interest.

## Load required modules
module purge
module load baypass/2.2

## Copy BayPass files to scratch - requires a BayeScan/GESTE-format file created from the VCF using PGDSpider with a population definition file
cp $inputdir/Coho.2POP.covariate $SLURM_TMPDIR # <- defines the two groups being compared as "0" and "1" for the contrast analyses
cp $inputdir/*BAYPASS* $SLURM_TMPDIR
cd $SLURM_TMPDIR

## Run
n=1
for i in *BAYPASS*;do
echo "starting combination $n of 8, file $i"
i_baypass -gfile $i -efile Coho.2POP.covariate -outprefix out_covariate_$i
cp out_covariate_$i* $inputdir
n=$(($n+1))
done

## Clean up scratch
cd $SLURM_TMPDIR
rm -r *

### Section 4.3 - Structure. Run for K values 1-10 with n = 1-10 reps each.
k=1
n=1

## Set input and output directories
inputdir=???? # <- location of the filtered dataset, created from the combined VCF using PGDSpider2

## Load required modules
module load StdEnv/2020 gcc/9.3.0
module load structure/2.3.4

## Copy STRUCTURE file to scratch - requires a STRUCTURE-format file created from the VCF using PGDSpider with a population definition file
cp $inputdir/project_data_nosibs $SLURM_TMPDIR
cp $inputdir/mainparams $SLURM_TMPDIR
cp $inputdir/extraparams $SLURM_TMPDIR
cd $SLURM_TMPDIR
dos2unix * # needed for STRUCTURE to read the input file

## Run
structure -i project_data_nosibs -o Coho.ALL.bssnper.filt4.recode.STRUCTURE_k$k\n$n -K $k

## Copy files back to normal storage
cp *_f $inputdir

## Clean up scratch
cd $SLURM_TMPDIR
rm -r *
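
## A minimal sketch (an assumption, not shown in the original appendix) of how the full
## K = 1-10, rep = 1-10 grid could be driven rather than editing k and n by hand: loop over
## both values and submit one job per combination. "run_structure_one.sh" is a hypothetical
## wrapper around the Section 4.3 commands that reads k and n from its arguments.
for k in $(seq 1 10); do
for n in $(seq 1 10); do
sbatch run_structure_one.sh $k $n # <- hypothetical per-run job script
done
done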
### Section 4.4 - PCA in adegenet. R CODE
#adegenet PCA
#clear workspace
rm(list=ls())
#set working directory
setwd("?????")
#install required packages - run first time only
#install.packages(c("adegenet","vcfR","ggplot2","cowplot"))
#load required packages
library("adegenet")
library("vcfR")
library("ggplot2")
library("cowplot")
#import vcf file and convert to genlight format
vcf<-read.vcfR("Coho.ALL.bssnper2.filt4.nosibs.recode.vcf")
vcf_genlight<-vcfR2genlight(vcf)
#convert to tabulated data for pca
vcf_tab <- tab(vcf_genlight, freq = TRUE, NA.method = "mean")
#perform PCA and plot
vcf_pca <- dudi.pca(vcf_tab, scale = FALSE, scannf = FALSE, nf = 3)
barplot(vcf_pca$eig[1:50],main = "PCA eigenvalues", col = heat.colors(50)) #barplot of eigenvalues
memberships<-read.table("GroupMemberships_bssnper2.csv",header=TRUE,sep=",",fileEncoding="UTF-8-BOM")
Group<-memberships$GROUP
plot1<-qplot(x=vcf_pca$li[,1],y=vcf_pca$li[,2],geom="point",color=Group,xlab="PC 1 (2.44 %)",ylab="PC 2 (2.40 %)",size=2)+theme_classic()+theme(legend.position="none")+theme(axis.text=element_text(size=16),axis.title=element_text(size=20))
plot2<-qplot(x=vcf_pca$li[,1],y=vcf_pca$li[,3],geom="point",color=Group,xlab="PC 1 (2.44 %)",ylab="PC 3 (2.14 %)",size=2)+theme_classic()+theme(legend.position="none")+theme(axis.text=element_text(size=16),axis.title=element_text(size=20))
plot_grid(plot1,plot2,labels=c("A","B"))
ggsave2("combined_nolegend_PERCENTs.jpg",width=10,height=5)

### Section 4.5 - FSTs. R CODE
#clear workspace
rm(list=ls())
#set working directory
setwd("????")
#load required package
library("diveRsity")
### run analyses, all SNPs ###
#pairwise, all 6 groups ### DONE ###
out<-diffCalc(infile="Coho.ALL.bssnper2.filt4.nosibs.recode.GENEPOP.6POPS.txt",fst=TRUE,pairwise=TRUE,boots=1000,bs_pairwise=TRUE,para=TRUE)
write.csv(out$bs_pairwise$Fst,"bssnper2_diversity_FST_pairwise.csv")
write.csv(out$pw_locus,"bssnper2_diversity_FST_pairwise_perSNP.csv")
#global, all 6 groups ### DONE ###
out_global<-diffCalc(infile="Coho.ALL.bssnper2.filt4.nosibs.recode.GENEPOP.6POPS.txt",fst=TRUE,pairwise=FALSE,boots=1000,bs_locus=TRUE,para=TRUE)
write.csv(out_global$global_bs,"bssnper2_diversity_FST_global.csv")
write.csv(out_global$bs_locus,"bssnper2_diversity_FST_global_perSNP.csv")

### Section 4.6 - Randomization based p-values for "parallelism" tests. R CODE
#clear workspace
rm(list=ls())
#set working directory
setwd("?")
#load required packages
library("diveRsity")
library("adegenet")
library("vcfR")
vcf<-read.vcfR("Coho.ALL.bssnper2.filt4.nosibs.recode.BF100.HEADER.vcf") # VCF file containing the loci being tested
vcf_genlight<-vcfR2genlight(vcf)
vcf_tab <- tab(vcf_genlight, freq = TRUE,NA.method="asis")
GROUPS<-read.table("../GroupMemberships_bssnper2_6POPS.txt",header=TRUE) # Text file defining group memberships
# E = enriched (semi-natural in the manuscript), T = traditional (conventional in the manuscript), W = wild
cor_coefficients_ET<-numeric(length=1000)
cor_coefficients_EW<-numeric(length=1000)
cor_coefficients_TW<-numeric(length=1000)
#loop to shuffle rows
for(i in 1:1000){
vcf_shuffled<-vcf_tab[sample(1:61),]
vcf_shuffled_ER<-vcf_shuffled[1:12,] #ER #12
vcf_shuffled_TR<-vcf_shuffled[13:19,] #TR #7
vcf_shuffled_WR<-vcf_shuffled[20:25,] #WR #6
vcf_shuffled_ES<-vcf_shuffled[26:35,] #ES #10
vcf_shuffled_TS<-vcf_shuffled[36:44,] #TS #9
vcf_shuffled_WS<-vcf_shuffled[45:61,] #WS #17
difsE<-colMeans(vcf_shuffled_ER,na.rm=TRUE)-colMeans(vcf_shuffled_ES,na.rm=TRUE)
difsT<-colMeans(vcf_shuffled_TR,na.rm=TRUE)-colMeans(vcf_shuffled_TS,na.rm=TRUE)
difsW<-colMeans(vcf_shuffled_WR,na.rm=TRUE)-colMeans(vcf_shuffled_WS,na.rm=TRUE)
cor_coefficients_ET[i]<-cor.test(difsE,difsT)$estimate
cor_coefficients_EW[i]<-cor.test(difsE,difsW)$estimate
cor_coefficients_TW[i]<-cor.test(difsT,difsW)$estimate
}
#actual values
vcf_ER<-vcf_tab[GROUPS[,2]=="Enriched_Adult",]
vcf_TR<-vcf_tab[GROUPS[,2]=="Trad_Adult",]
vcf_WR<-vcf_tab[GROUPS[,2]=="Wild_Adult",]
vcf_ES<-vcf_tab[GROUPS[,2]=="Enriched_Smolt",]
vcf_TS<-vcf_tab[GROUPS[,2]=="Trad_Smolt",]
vcf_WS<-vcf_tab[GROUPS[,2]=="Wild_Smolt",]
difsE_actual<-colMeans(vcf_ER,na.rm=TRUE)-colMeans(vcf_ES,na.rm=TRUE)
difsT_actual<-colMeans(vcf_TR,na.rm=TRUE)-colMeans(vcf_TS,na.rm=TRUE)
difsW_actual<-colMeans(vcf_WR,na.rm=TRUE)-colMeans(vcf_WS,na.rm=TRUE)
cor_coefficients_actual_ET<-cor.test(difsE_actual,difsT_actual)$estimate #0.51
cor_coefficients_actual_EW<-cor.test(difsE_actual,difsW_actual)$estimate #0.49
cor_coefficients_actual_TW<-cor.test(difsT_actual,difsW_actual)$estimate #0.44
#compare the observed correlations to the null distributions from the shuffled data
max(cor_coefficients_ET) # 0.34
max(cor_coefficients_EW) # 0.36
max(cor_coefficients_TW) # 0.45
quantile(cor_coefficients_ET,0.95)
quantile(cor_coefficients_EW,0.95)
quantile(cor_coefficients_TW,0.95) # 0.192