1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104
| suppressMessages(suppressWarnings(library(dplyr))) suppressMessages(suppressWarnings(library(vegan))) suppressMessages(suppressWarnings(library(doParallel))) suppressMessages(suppressWarnings(library(foreach))) suppressMessages(suppressWarnings(library(mgcv))) suppressMessages(suppressWarnings(library(reshape2))) suppressMessages(suppressWarnings(library(ggplot2))) suppressMessages(suppressWarnings(library(cowplot))) suppressMessages(suppressWarnings(library(Rcpp))) suppressMessages(suppressWarnings(library(RcppArmadillo)))
wkdir <- dirname(rstudioapi::getActiveDocumentContext()$path) setwd(wkdir)
metadata_file = "data/group_soiltow.csv" count_matrix = "data/water_soil.csv"
metadata <- read.csv(file = metadata_file, header = TRUE, sep = ",", row.names = 1) otus <- read.csv(file = count_matrix, header = TRUE, sep = ",", row.names = 1) otus <- t(as.matrix(otus))
EM_iterations = 1000
different_sources_flag = 0
source("FEAST_src/src.R")
common.sample.ids <- intersect(rownames(metadata), rownames(otus)) otus <- otus[common.sample.ids, ] metadata <- metadata[common.sample.ids, ]
if (length(common.sample.ids) <= 1) { message <- paste(sprintf('Error: there are %d sample ids in common '), 'between the metadata file and data table') stop(message) }
if (different_sources_flag == 0) { metadata$id[metadata$SourceSink == 'Source'] = NA metadata$id[metadata$SourceSink == 'Sink'] = c(1:length(which(metadata$SourceSink == 'Sink'))) }
envs <- metadata$Env Ids <- na.omit(unique(metadata$id)) Proportions_est <- list()
set.seed(1234175)
for(it in 1:length(Ids)){ if(different_sources_flag == 1){ train.ix <- which(metadata$SourceSink == 'Source' & metadata$id == Ids[it]) test.ix <- which(metadata$SourceSink == 'Sink' & metadata$id == Ids[it]) } else { train.ix <- which(metadata$SourceSink == 'Source') test.ix <- which(metadata$SourceSink == 'Sink' & metadata$id == Ids[it]) } num_sources <- length(train.ix) COVERAGE = min(rowSums(otus[c(train.ix, test.ix), ])) str(COVERAGE) sources <- as.data.frame(as.matrix(rarefy(as.data.frame(otus[train.ix,]), COVERAGE))) sinks <- as.data.frame(as.matrix(rarefy(as.data.frame(t(as.matrix(otus[test.ix,]))), COVERAGE))) print(paste("Number of OTUs in the sink sample = ",length(which(sinks > 0)))) print(paste("Seq depth in the sources and sink samples = ",COVERAGE)) print(paste("The sink is:", envs[test.ix])) FEAST_output <- FEAST(source = sources, sinks = t(sinks), env = envs[train.ix], em_itr = EM_iterations, COVERAGE = COVERAGE) Proportions_est[[it]] <- FEAST_output$data_prop[, 1] names(Proportions_est[[it]]) <- c(as.character(envs[train.ix]), "unknown") if(length(Proportions_est[[it]]) < num_sources + 1){ tmp = Proportions_est[[it]] Proportions_est[[it]][num_sources] = NA Proportions_est[[it]][num_sources + 1] = tmp[num_sources] } print("Source mixing proportions") print(Proportions_est[[it]]) }
res = data.frame(Proportions_est) print(1 - mean(unlist(res[nrow(res), ])))
write.csv(Proportions_est, file = "res.csv", quote = FALSE)
|