plot.tag.cluster.counts = function(count.col.name, cluster.filename) { cols = brewer.pal(9,"Set1")[-6]; # -6 takes out yellow x.label = "Outer count"; if (count.col.name == "inCount") { x.label = "Inner count"; } stat1.stim.hg18.clusters = as.matrix(read.delim(cluster.filename, header=T)); hg18.t = table(stat1.stim.hg18.clusters[,count.col.name]); #out.count.hg18.mtx = cbind(as.numeric(names(hg18.t)), as.vector(hg18.t)); # overlap number, frequency count.hg18.mtx = cbind(as.numeric(names(hg18.t)), as.vector(hg18.t)/sum(as.vector(hg18.t))); # overlap number, proportion plot(count.hg18.mtx, ty="n", log="xy", xlim=c(1,2000), xlab=x.label, ylab="Density"); chrs = unique(stat1.stim.hg18.clusters[,1]); for (i in 1:length(chrs)) { chr = chrs[i]; clusters.in.chr = stat1.stim.hg18.clusters[stat1.stim.hg18.clusters[,1]==chr, ]; chr.t = table(clusters.in.chr[,count.col.name]); #out.count.chr.mtx = cbind(as.numeric(names(chr.t)), as.vector(chr.t)); # overlap number, frequency count.chr.mtx = cbind(as.numeric(names(chr.t)), as.vector(chr.t)/sum(as.vector(chr.t))); # overlap number, proportion line.col = "gray"; #if (i == 22) line.col = "red"; lines(count.chr.mtx, ty="l", col=line.col); # reuse the colors } lines(count.hg18.mtx, ty="l"); legend("topright", bty="n", lty=1, legend=c("Whole genome", "Individual chromosomes"), col=c("black", "gray")); } # ==================== simulation functions ======================= get.simu.tag.counts.tally.mtx = function(count.col.name, file.name.base.start, file.name.base.end, n.rep) { simu.tag.counts.tally.list = vector("list", n.rep); simu.tag.counts.maxes = numeric(); for (i in 1:n.rep) { cat(i, "\n"); flush.console(); file.name = paste(file.name.base.start, i, file.name.base.end, sep=""); simu.clusters = as.matrix(read.delim(file.name, header=T)); simu.tag.counts = simu.clusters[,count.col.name]; simu.tag.counts.tally.list[[i]] = table(simu.tag.counts); simu.tag.counts.maxes = c(simu.tag.counts.maxes, max(simu.tag.counts)); } simu.tag.counts.max = max(simu.tag.counts.maxes); # simu.tag.counts.tally.mtx = matrix(0, nrow=n.rep, ncol=max(simu.tag.counts.max)); colnames(simu.tag.counts.tally.mtx) = 1:simu.tag.counts.max; for (i in 1:n.rep) { simu.tag.counts.tally.mtx[i,names(simu.tag.counts.tally.list[[i]])] = simu.tag.counts.tally.list[[i]]; } return(simu.tag.counts.tally.mtx); } assign.p.values.to.real.tag.counts = function(simu.tag.counts.tally.sums, real.tag.counts) { p.values = 0*simu.tag.counts.tally.sums; simu.tag.counts.total = sum(simu.tag.counts.tally.sums); for (i in 1:length(simu.tag.counts.tally.sums)) { p.values[i] = sum(simu.tag.counts.tally.sums[i:length(simu.tag.counts.tally.sums)])/simu.tag.counts.total; } simu.tag.counts = as.numeric(names(simu.tag.counts.tally.sums)); real.tag.count.p.values = 0*real.tag.counts; for (i in 1:length(real.tag.counts)) { real.tag.count = real.tag.counts[i]; if (is.element(real.tag.count, simu.tag.counts)) { real.tag.count.p.values[i] = p.values[as.character(real.tag.count)]; } else { real.tag.count.p.values[i] = 0; } } real.tag.count.mtx = cbind(real.tag.counts, real.tag.count.p.values, p.adjust(real.tag.count.p.values, "fdr")); colnames(real.tag.count.mtx) = c("tag.count", "p-value", "fdr-p-value"); return(real.tag.count.mtx); } plot.real.simu.tally = function(simu.outer.tag.counts.tally, simu.outer.tag.counts.tally.sds, real.outer.tag.counts.tally) { simu.outer.tag.counts = as.numeric(names(simu.outer.tag.counts.tally)); real.outer.tag.counts = as.numeric(names(real.outer.tag.counts.tally)); simu.outer.tag.counts.min = min(simu.outer.tag.counts); simu.outer.tag.counts.max = max(simu.outer.tag.counts); y.max = simu.outer.tag.counts.tally[as.character(simu.outer.tag.counts.min)] + simu.outer.tag.counts.tally.sds[as.character(simu.outer.tag.counts.min)]; y.min = simu.outer.tag.counts.tally[as.character(simu.outer.tag.counts.max)] - simu.outer.tag.counts.tally.sds[as.character(simu.outer.tag.counts.max)]; x.max = max(real.outer.tag.counts); plot(simu.outer.tag.counts, simu.outer.tag.counts.tally, xlim=c(1, x.max), ylim=c(0.05, y.max), ty="n", log="xy"); lines(simu.outer.tag.counts, simu.outer.tag.counts.tally, col="gray", ty="o"); lines(real.outer.tag.counts, real.outer.tag.counts.tally, ty="o"); } plot.cluster.dists = function(count.col.name, cluster.file.names, legend=NULL, output="screen", output.file.name=NULL) { if (output == "pdf") { pdf(file=output.file.name, width=6, height=6, title=""); } if (output == "png") { png(filename=output.file.name, width=800, height=800, pointsize=16, bg = "white", res = NA, restoreConsole=TRUE); } cols = rep(brewer.pal(9,"Set1")[-6],2); # -6 takes out yellow for (i in 1:length(cluster.file.names)) { cluster.file.name = cluster.file.names[i]; cat("Parsing", cluster.file.name, "\n"); flush.console(); clusters = as.matrix(read.delim(cluster.file.name, header=T)); cluster.t = table(clusters[,count.col.name]); cluster.t = cbind(as.numeric(names(cluster.t)), as.vector(cluster.t)); # overlap number, frequency if (cluster.file.name == "stat1-stim_hg18_tag-clusters.txt") { plot(cluster.t, ty="l", log="xy", xlab="Tag count", ylab="Frequency", xlim=c(1,10), ylim=c(10000,1200000)); } else { lines(cluster.t, ty="l", col=cols[i]); }; } legend("topright",legend=legend, bty="n", lty=1, col=c("black", cols[2:5]), cex=0.8); if (output != "screen") { dev.off(); } } plot.simu.groups = function(count.col.name, real.cluster.filename, site.lens, num.sites, site.prob.increase.factors, frgd.inter.prfls, frgd.inter.prfl.pars.list, frgd.intra.prfls, bkgd.distrs, bkgd.distr.pars.list, # default bkgd.distr.pars including 'unif' tag.lens, # tag.lens: a vector of tag lengths and the length of the vector is the number of tags file.name.base.start, file.name.base.end, output="screen" ) { # the real overlap count real.clusters = as.matrix(read.delim(real.cluster.filename, header=T)); real.t = table(real.clusters[,count.col.name]); #real.overlap.counts = cbind(as.numeric(names(real.t)), as.vector(real.t)/sum(as.vector(real.t))); # overlap number, proportion real.overlap.counts = cbind(as.numeric(names(real.t)), as.vector(real.t)); # overlap number, frequency cols = rep(brewer.pal(9,"Set1")[-6],2); # -6 takes out yellow for (site.len in site.lens) { for (frgd.intra.prfl in frgd.intra.prfls) { for (frgd.inter.prfl in frgd.inter.prfls) { frgd.inter.prfl.pars = frgd.inter.prfl.pars.list[[frgd.inter.prfl]]; if (frgd.inter.prfl == "binom") { frgd.inter.prfl.pars = c(50, 0.01, 0.1); } for (bkgd.distr in bkgd.distrs) { #bkgd.distr.pars = c(); # default including 'unif' #if (bkgd.distr == "gamma") { bkgd.distr.pars = c(2, 3); } #if (bkgd.distr == "lnorm") { bkgd.distr.pars = c(1, 1); } bkgd.distr.pars = bkgd.distr.pars.list[[bkgd.distr]]; if (output == "pdf" | output == "png") { output.file.name = ""; if (output == "pdf") { output.file.name = paste(num.sites, count.col.name, paste(c(frgd.inter.prfl, frgd.inter.prfl.pars), collapse="-"), frgd.intra.prfl, paste(c(bkgd.distr, bkgd.distr.pars), collapse="-"), ".pdf", sep="_"); pdf(file=output.file.name, width=6, height=6, title=""); } if (output == "png") { output.file.name = paste(num.sites, count.col.name, paste(c(frgd.inter.prfl, frgd.inter.prfl.pars), collapse="-"), frgd.intra.prfl, paste(c(bkgd.distr, bkgd.distr.pars), collapse="-"), ".png", sep="_"); png(filename=output.file.name, width=800, height=800, pointsize=16, bg = "white", res = NA, restoreConsole=TRUE); } cat(output.file.name, "\n"); flush.console(); } main.title = paste(num.sites, count.col.name, paste(c(frgd.inter.prfl, frgd.inter.prfl.pars), collapse="-"), frgd.intra.prfl, paste(c(bkgd.distr, bkgd.distr.pars), collapse="-"), sep=" / "); plot(real.overlap.counts, ty="l", log="xy", xlab="Tag count", ylab="Frequency", main=main.title); for (i in 1:length(site.prob.increase.factors)) { site.prob.increase.factor = site.prob.increase.factors[i]; simu.cluster.filename = paste(file.name.base.start, paste(site.len, num.sites, sep="-"), site.prob.increase.factor, paste(c(frgd.inter.prfl, frgd.inter.prfl.pars), collapse="-"), frgd.intra.prfl, paste(c(bkgd.distr, bkgd.distr.pars), collapse="-"), file.name.base.end, sep="_"); simu.clusters = as.matrix(read.delim(simu.cluster.filename, header=T)); simu.t = table(simu.clusters[,count.col.name]); #simu.overlap.counts = cbind(as.numeric(names(simu.t)), as.vector(simu.t)/sum(as.vector(simu.t))); # overlap number, proportion simu.overlap.counts = cbind(as.numeric(names(simu.t)), as.vector(simu.t)); # overlap number, frequency lines(simu.overlap.counts, ty="l", pch=i, col=cols[i]); # reuse the colors } legend("topright", legend=site.prob.increase.factors, #pch=1:length(site.prob.increase.factors), bty="n", lty=1, col=cols[1:length(site.prob.increase.factors)], cex=0.8); if (output == "pdf" | output == "png") { dev.off(); } } } } } } place.repeats = function(processed.rmsk.filename, chrs.mtx) { repeats = as.matrix(read.table(processed.rmsk.filename, header=F, comment.char="#")); repeats[repeats[ ,2]==0,2] = repeats[repeats[ ,2]==0,2] + 1; # change from 0 to 1 coordiantes chrs = sort(unique(repeats[,1])); free.rgns = matrix(-1, nrow=nrow(repeats)+length(chrs), ncol=3); free.rgns.count = 0; for (chr in chrs) { cat("chr", chr, "\n"); flush.console(); chr.start = chrs.mtx[chrs.mtx[,1]==chr,2]; chr.end = chrs.mtx[chrs.mtx[,1]==chr,3]; repeats.chr = repeats[repeats[,1]==chr, 2:3]; o = order(repeats.chr[,1]); # sort on the start, MUST! repeats.chr = repeats.chr[o, ]; repeats.chr = rbind(repeats.chr, c(10000000000, 10000000005)); # artificial repeat for pop the last real tag stack repeat.rgns.chr = matrix(-1, nrow=nrow(repeats.chr), ncol=2); repeat.rgns.chr.count = 0; mergedRepeat = c(-1, -1); # start, end of merged repeat for (i in 1:nrow(repeats.chr)) { if (mergedRepeat[1] == -1) { mergedRepeat = repeats.chr[i, ]; } else { if (repeats.chr[i,1] - mergedRepeat[2] <= 1) { mergedRepeat = c( min(mergedRepeat[1], repeats.chr[i,1]), max(mergedRepeat[2], repeats.chr[i,2]) ); } else { repeat.rgns.chr.count = repeat.rgns.chr.count + 1; repeat.rgns.chr[repeat.rgns.chr.count, ] = mergedRepeat; mergedRepeat = c(-1, -1); mergedRepeat = repeats.chr[i, ]; } } if (i %% 50000 == 0) { cat("\t", i, "\n"); flush.console(); } } repeat.rgns.chr = repeat.rgns.chr[repeat.rgns.chr[,1]!=-1, ]; # get rid of the redundant rows for (i in 1:nrow(repeat.rgns.chr)) { if (i == 1) # the first repeat region { if (repeat.rgns.chr[i,1] != chr.start) # this first repart region starts at chr.start, ignore it because there is no free rgn before it { free.rgns.count = free.rgns.count + 1; free.rgns[free.rgns.count, ] = c(chr, chr.start, repeat.rgns.chr[i,1]-1); if (chr.start >= repeat.rgns.chr[i,1]-1) { cat("#1 ", free.rgns.count, c(chr, chr.start, repeat.rgns.chr[i,1]-1), repeat.rgns.chr[i,], "\n"); flush.console(); }; } } else { if (i == nrow(repeat.rgns.chr)) # the last repeat region { free.rgns.count = free.rgns.count + 1; free.rgns[free.rgns.count, ] = c(chr, repeat.rgns.chr[i-1,2]+1, repeat.rgns.chr[i,1]-1); if (repeat.rgns.chr[i-1,2]+1 > repeat.rgns.chr[i,1]-1) { cat("#2 ", free.rgns.count, c(chr, repeat.rgns.chr[i-1,2]+1, repeat.rgns.chr[i,1]-1), repeat.rgns.chr[i,], "\n"); flush.console(); }; if (repeat.rgns.chr[i,2] != chr.end) # this last repart region ends at chr.end, ignore it because there is no free rgn after it { free.rgns.count = free.rgns.count + 1; free.rgns[free.rgns.count, ] = c(chr, repeat.rgns.chr[i,2]+1, chr.end); if (repeat.rgns.chr[i,2]+1 > chr.end) { cat("#3 ", free.rgns.count, c(chr, repeat.rgns.chr[i,2]+1, chr.end), repeat.rgns.chr[i,], "\n"); flush.console(); }; } } else { free.rgns.count = free.rgns.count + 1; free.rgns[free.rgns.count, ] = c(chr, repeat.rgns.chr[i-1,2]+1, repeat.rgns.chr[i,1]-1); if (repeat.rgns.chr[i-1,2]+1 > repeat.rgns.chr[i,1]-1) { cat("#4 ", free.rgns.count, c(chr, repeat.rgns.chr[i-1,2]+1, repeat.rgns.chr[i,1]-1), repeat.rgns.chr[i,], "\n"); flush.console(); }; } } } } free.rgns = free.rgns[free.rgns[,1]!=-1, ]; # chr cannot be -1 return(free.rgns); } place.sites = function(site.lens, free.rgns) { sites = matrix(-1, nrow=0, ncol=3); # col: chr (1-22, 23=X, 24=Y), start, end, # the length of site.lens is the number of sites to be placed site.len.table = rev(table(site.lens)); # from large to small, MUST site.len.levels = as.numeric(names(site.len.table)); for (site.len.level in site.len.levels) { free.rgns[,3] = free.rgns[ ,3] - site.len.level + 1; # take care of the end edges free.rgns.bad = free.rgns[free.rgns[ ,3]=free.rgns[ ,2], , drop=F]; num.placeable.sites = min(nrow(free.rgns.good), site.len.table[as.character(site.len.level)]) sampled.free.rgns.good.inds = sample(nrow(free.rgns.good), num.placeable.sites, prob=(free.rgns.good[,3]-free.rgns.good[,2]+1)); # select a row, using the rgn lengths as weights cat("\tPlacing sites... ", site.len.level, "bp sites:", site.len.table[as.character(site.len.level)], " free.rgns before:", nrow(free.rgns), " free.rgns.good:", nrow(free.rgns.good), " site placed:", num.placeable.sites, "\n"); flush.console(); # place the sites site.starts = sapply(sampled.free.rgns.good.inds, function(ind) sample(free.rgns.good[ind,3]-free.rgns.good[ind,2]+1, 1) + free.rgns.good[ind,2] - 1); site.ends = site.starts + site.len.level - 1; sites = cbind(free.rgns.good[sampled.free.rgns.good.inds,1,drop=F], site.starts, site.ends); # split the select good rgn on the site free.rgns.good[,3] = free.rgns.good[ ,3] + site.len.level - 1; # add the length back sampled.free.rgns.good = free.rgns.good[sampled.free.rgns.good.inds, ,drop=F]; free.rgns.good = free.rgns.good[-sampled.free.rgns.good.inds, ]; # remove the selected rgns free.rgns.good = rbind(free.rgns.good, cbind(sampled.free.rgns.good[,1], sampled.free.rgns.good[,2], site.starts-1)); free.rgns.good = rbind(free.rgns.good, cbind(sampled.free.rgns.good[,1], site.ends+1, sampled.free.rgns.good[3])); # add the bad rgns back, as they may be used for smaller sites free.rgns.bad[,3] = free.rgns.bad[ ,3] + site.len.level - 1; # add the length back free.rgns = rbind(free.rgns.bad, free.rgns.good); site.len.table[names(site.len.table)==as.character(site.len.level)] = site.len.table[names(site.len.table)==as.character(site.len.level)]-num.placeable.sites; site.lens = rep(site.len.levels, site.len.table); } # sorting o = order(sites[ ,1], sites[ ,2]); sites = sites[o, , drop=F]; free.rgns = free.rgns[free.rgns[ ,3]>=free.rgns[ ,2], , drop=F]; o = order(free.rgns[ ,1], free.rgns[ ,2]); # sort on the chr and then the start free.rgns = free.rgns[o, ]; rownames(free.rgns) = c(); cat("\t\t\tfree.rgns after:", nrow(free.rgns), " site not placed:", length(site.lens), "\n"); flush.console(); genome = list(sites, free.rgns); names(genome) = c("sites", "free.rgns"); return(genome); } simulate = function(genome, site.prob.increase.factors, frgd.inter.prfls, frgd.inter.prfl.pars.list, frgd.intra.prfls, bkgd.distrs, bkgd.distr.pars.list, # default bkgd.distr.pars including 'unif' tag.lens, # tag.lens: a vector of tag lengths and the length of the vector is the number of tags file.name.base.start, file.name.base.end ) { sites = genome$sites; site.len = sites[1,3] - sites[1,2] + 1; num.sites = nrow(sites); for (site.prob.increase.factor in site.prob.increase.factors) { for (frgd.inter.prfl in frgd.inter.prfls) { frgd.inter.prfl.pars = frgd.inter.prfl.pars.list[[frgd.inter.prfl]]; if (frgd.inter.prfl == "binom") { frgd.inter.prfl.pars = c(50, 0.01, 0.1); } for (frgd.intra.prfl in frgd.intra.prfls) { for (bkgd.distr in bkgd.distrs) { #bkgd.distr.pars = c(); # default including 'unif' #if (bkgd.distr == "gamma") { bkgd.distr.pars = c(2, 3); } #if (bkgd.distr == "lnorm") { bkgd.distr.pars = c(1, 1); } bkgd.distr.pars = bkgd.distr.pars.list[[bkgd.distr]]; file.name.middle = paste(paste(site.len, num.sites, sep="-"), site.prob.increase.factor, paste(c(frgd.inter.prfl, frgd.inter.prfl.pars), collapse="-"), frgd.intra.prfl, paste(c(bkgd.distr, bkgd.distr.pars), collapse="-"), sep="_"); cat("\tSimulating... ", file.name.middle, "\n"); flush.console(); tags = place.tags(tag.lens, site.prob.increase.factor, genome, frgd.inter.prfl, frgd.inter.prfl.pars, frgd.intra.prfl, bkgd.distr, bkgd.distr.pars); print.tags(tags, paste(file.name.base.start, file.name.middle, file.name.base.end, sep="_")); } } } } } place.tags = function(tag.lens, site.prob.increase.factor, genome, frgd.inter.prfl, frgd.inter.prfl.pars, frgd.intra.prfl, # inter: among sites; intra: within each site bkgd.distr, bkgd.distr.pars) { # initial stuff sites = genome$sites; site.lens = sites[,3] - sites[,2] + 1; site.total.len = sum(site.lens); site.len.max = max(site.lens); free.rgns = genome$free.rgns; free.rgn.lens = free.rgns[,3] - free.rgns[,2] + 1; free.rgn.total.len = sum(free.rgn.lens); # make the matrix of free rgn blocks block.size = 1000; # 1 Kb block total.num.blocks = ceiling(free.rgn.total.len/block.size); #free.rgn.blocks = matrix(-1, nrow=nrow(free.rgns)*2, ncol=5); # col: chr, start, end, weights, free.rgn.ind free.rgn.blocks = matrix(-1, total.num.blocks*10, ncol=5); # col: chr, start, end, weights, free.rgn.ind free.rgn.block.count = 0; for (i in 1:nrow(free.rgns)) { free.rgn = free.rgns[i,]; num.blocks = ceiling(free.rgn.lens[i]/block.size); for (j in 1:num.blocks) { block.start = (j-1)*block.size + free.rgns[i,2]; block.end = block.start + block.size - 1; if (block.end > free.rgns[i,3]) { block.end = free.rgns[i,3] } free.rgn.block.count = free.rgn.block.count + 1; free.rgn.blocks[free.rgn.block.count, ] = c(free.rgns[i,1], block.start, block.end, 0, i); } } free.rgn.blocks = free.rgn.blocks[free.rgn.blocks[,1]!=-1, ]; # remove unused rows free.rgn.block.lens = free.rgn.blocks[,3] - free.rgn.blocks[,2] + 1; cat("\t\t\tfree.rgns:", nrow(free.rgns), " free.rgn.blocks:", nrow(free.rgn.blocks), "\n"); flush.console(); # add the sampling weights to free rgn blocks if (bkgd.distr == "unif") # the weight is the length { free.rgn.blocks[,4] = free.rgn.block.lens; } if (bkgd.distr == "gamma") # generate one rand num for each block and the weight of a block is the rand num multiplied by its length { free.rgn.blocks[,4] = free.rgn.block.lens * rgamma(nrow(free.rgn.blocks), shape=bkgd.distr.pars[1], scale=bkgd.distr.pars[2]); } if (bkgd.distr == "lnorm") { free.rgn.blocks[,4] = free.rgn.block.lens * rlnorm(nrow(free.rgn.blocks), meanlog=bkgd.distr.pars[1], sdlog=bkgd.distr.pars[2]); } free.rgn.weight.total = sum(free.rgn.blocks[,4]); free.rgn.weight.base.average = free.rgn.weight.total/free.rgn.total.len; # calc the sampling weights for site site.weight.base.average = free.rgn.weight.base.average * site.prob.increase.factor; site.weight.total = site.weight.base.average * site.total.len; # distribute the total site weight among sites site.weight.margin = numeric(nrow(sites)); if (frgd.inter.prfl == "unif" || frgd.inter.prfl == "powerlaw") # initail of powerlaw is the same as unif { site.weight.margin = site.weight.base.average * site.lens; } if (frgd.inter.prfl == "binom") { binom.size = frgd.inter.prfl.pars[1]; binom.prob = frgd.inter.prfl.pars[2]; binom.rsdl = frgd.inter.prfl.pars[3]; site.weight.margin.coef = rep(dbinom(0:binom.size, binom.size, binom.prob)+binom.rsdl, each=ceiling(nrow(sites)/(binom.size+1)))[1:nrow(sites)]; site.weight.margin.coef = site.weight.margin.coef * site.lens; # weight the coefs by the site length site.weight.margin.coef = site.weight.margin.coef / sum(site.weight.margin.coef); # normalize site.weight.margin = sample(site.weight.margin.coef) * site.weight.total; } # distribute the weight of each site among its length site.weights = vector("list", nrow(sites)); for (i in 1:nrow(sites)) { if (frgd.intra.prfl == "unif") { site.weights[[i]] = rep(site.weight.base.average, site.lens[i]); } if (frgd.intra.prfl == "binom") { # note: the dbinom vector sums to 1, so the total weight mass for each site is the same as under 'unif' # p = 0.5 for symmetry. n = sites[i,4]-1 to make the vector to be sites[ ,4] long. site.weights[[i]] = site.weight.margin[i] * dbinom(0:(site.lens[i]-1), site.lens[i]-1, 0.5); } if (frgd.intra.prfl == "tri") { # note: the dbinom vector sums to 1, so the total weight mass for each site is the same as under 'unif' # p = 0.5 for symmetry. n = sites[i,4]-1 to make the vector to be sites[ ,4] long. site.weights[[i]] = site.weight.margin[i] * dtriangular(1:site.lens[i]); } } # add the vector of sampling weights to sites sites = cbind(sites, site.weight.margin); # place tags -- 1: partition tags into tags in sites and tags in free rgns num.tags = length(tag.lens); gtypes = sample(names(genome), num.tags, prob=c(site.weight.total, free.rgn.weight.total), replace=T); num.tags.in.sites = sum(gtypes=="sites"); tag.lens.in.sites = tag.lens[gtypes=="sites"]; num.tags.in.free.rgns = sum(gtypes=="free.rgns"); tag.lens.in.free.rgns = tag.lens[gtypes=="free.rgns"]; cat("\t\t\tnum.tags.in.sites:", num.tags.in.sites, " num.tags.in.free.rgns", num.tags.in.free.rgns, "\n"); flush.console(); # place tags -- 2: place tags in free rgns sampled.free.rgn.block.inds = sample(nrow(free.rgn.blocks), size=num.tags.in.free.rgns, prob=free.rgn.blocks[,4], replace=T); # NOTE using 'sample(x=free.rgn.blocks[ind,2]:free.rgn.blocks[ind,3], size=1)' is VERY slow tag.middles.in.free.rgns = sapply(sampled.free.rgn.block.inds, function(ind) sample(x=free.rgn.block.lens[ind], size=1)+free.rgn.blocks[ind,2]-1); first.half.tag.lens.in.free.rgns = second.half.tag.lens.in.free.rgns = floor((tag.lens.in.free.rgns-1)/2); second.half.tag.lens.in.free.rgns[tag.lens.in.free.rgns %% 2 == 0] = second.half.tag.lens.in.free.rgns[tag.lens.in.free.rgns %% 2 == 0] + 1; tag.starts.in.free.rgns = tag.middles.in.free.rgns - first.half.tag.lens.in.free.rgns; tag.ends.in.free.rgns = tag.middles.in.free.rgns + second.half.tag.lens.in.free.rgns; tags.in.free.rgns = cbind(free.rgn.blocks[sampled.free.rgn.block.inds,1], tag.starts.in.free.rgns, tag.ends.in.free.rgns); sampled.free.rgn.block.inds.table = sort(table(sampled.free.rgn.block.inds), decreasing=T); # large to small sampled.free.rgn.block.inds.list = as.numeric(names(sampled.free.rgn.block.inds.table)); # place tags -- 3: place tags in sites sampled.site.inds = numeric(num.tags.in.sites); if (frgd.inter.prfl == "powerlaw") # prefereintial attachment, tags are placed one by one { pref.atta.increase.coef = frgd.inter.prfl.pars[1]; for (i in 1:num.tags.in.sites) { sampled.site.ind = sample(nrow(sites), 1, prob=sites[,4]); sites[sampled.site.ind,4] = sites[sampled.site.ind,4] + pref.atta.increase.coef*site.weight.margin[sampled.site.ind]; sampled.site.inds[i] = sampled.site.ind; if (i %% 2000 == 0) { cat(i, "tag placed\n"); flush.console(); } } } else { sampled.site.inds = sample(nrow(sites), num.tags.in.sites, prob=sites[,4], replace=T); } tag.middles.in.sites = as.numeric(sapply(sampled.site.inds, function(ind) sample(x=site.lens[ind], size=1, prob=site.weights[[ind]])+sites[ind,2]-1)); first.half.tag.lens.in.sites = second.half.tag.lens.in.sites = floor((tag.lens.in.sites-1)/2); second.half.tag.lens.in.sites[tag.lens.in.sites %% 2 == 0] = second.half.tag.lens.in.sites[tag.lens.in.sites %% 2 == 0] + 1; tag.starts.in.sites = tag.middles.in.sites - first.half.tag.lens.in.sites; tag.ends.in.sites = tag.middles.in.sites + second.half.tag.lens.in.sites; tags.in.sites = cbind(sites[sampled.site.inds,1], tag.starts.in.sites, tag.ends.in.sites); # combine tags = rbind(tags.in.free.rgns, tags.in.sites); o = order(tags[ ,1], tags[ ,2]); tags = tags[o, ]; return(tags); } dtriangular = function(x) { x1 = x2 = numeric(); y1 = y2 = numeric(); if (length(x) %% 2 == 0) # even { x1 = x[ 1:(length(x)/2) ]; x2 = x[ -(1:(length(x)/2)) ]; y1 = x1; y2 = rev(y1); } else # odd { x1 = x[ 1:ceiling(length(x)/2) ]; x2 = x[ -(1:ceiling(length(x)/2)) ]; y1 = x1; y2 = (-1)*x2 + 2*y1[length(y1)]; } y = c(y1, y2); return(y/sum(y)); } print.tags = function(tags, file.name) { num.tags = nrow(tags); tags.out = cbind(paste(">", 1:num.tags, sep=""), rep("..", num.tags), rep("U0", num.tags), rep(1, num.tags), rep(0, num.tags), rep(0, num.tags), tags[ ,1], tags[ ,2], rep("F", num.tags)); write.table(tags.out, file=file.name, quote=F, sep="\t", row.names=F, col.names=F); }