
options(stringsAsFactors=FALSE)

gen_individuals <- function(n,n.case=floor(n/2),penetrance=1,
                            background.proportion.phenotype=.05) {
  ## all cases are carriers
  ## (1 - penetrance) * backround.proportion.phenotype * n.controls...
  ## are controls that are carriers
  control.carriers <- round((1-penetrance) *
                            background.proportion.phenotype * (n-n.case))
  cat("Control carriers: ",control.carriers,"\n")
  ind.df <- data.frame(
                       subj.id=paste("subj_",1:n,sep=""),
                       case=FALSE,
                       carrier=FALSE
                       )
  rownames(ind.df) <- ind.df[,"subj.id"]
  ind.df[1:n.case,'case'] <- TRUE
  ind.df[which(ind.df[,'case']==TRUE),'carrier'] <- TRUE
  ## which ones are control carriers?
  if(control.carriers > 0) {
    rn <- sample(rownames(ind.df[which(ind.df[,'case']==FALSE),]),
                 control.carriers,replace=FALSE)
    ind.df[rn,'carrier'] <- TRUE
  }
  return(ind.df)
}

gen_snps <- function(n.snps,n.risk.snps,risk=NULL,MAF=NULL,
                     chr=10,start=10^6,stop=10^7,noise=1) {
  if(is.null(MAF)) {
    MAF <- runif(n.snps,0,.25)
  }
  if(length(MAF)==1) {
    MAF <- rep(MAF,n.snps)
  }
  if(is.null(risk)) {
    risk <- sample(4,n.risk.snps)
  }
  if(length(risk)==1) {
    risk <- rep(risk,n.risk.snps)
  }
  snp.df <- data.frame(
                       snp.id=paste("snp_",1:n.snps,sep=''),
                       MAF=MAF,
                       risk=1,
                       chr=chr,
                       pos=ceiling(seq(start,stop,length=n.snps))
                       )
  rownames(snp.df) <- snp.df[,'snp.id']
  risk.snps <- sample(rownames(snp.df),n.risk.snps)
  for(i in 1:n.risk.snps) {
    risk.snp <- risk.snps[i]
    snp.df[risk.snp,'risk'] <- risk[i]
  }
  ## throw in a little noise to make things interesting
  snp.df[,'risk'] <- jitter(snp.df[,'risk'],factor=noise)
  
  return(snp.df)
}

## Just a helper function for IDs
parse_subj_id_from_haplotype_id <- function(haplotype_ids) {
  sapply(strsplit(haplotype_ids,".",fixed=TRUE),function(x){x[1]})
}

## generate the matrix of genotypes
gen_genotypes <- function(snps.df,subjects.df) {
  n.snps <- nrow(snps.df)
  n.subjects <- nrow(subjects.df)

  ## generate the matrix of haplotypes that underlies the genotypes
  haplotype.matrix <- matrix(data=0,nrow=n.subjects*2,ncol=n.snps)
  rownames(haplotype.matrix) <- unlist(lapply(rownames(subjects.df),paste,c(1,2),sep="."))
  subjid.haprows.vector <- parse_subj_id_from_haplotype_id(rownames(haplotype.matrix))
  colnames(haplotype.matrix) <- rownames(snps.df)

  ## determine which of rows in the haplotype matrix belongs to a carrier
  carrier.subjid <- subjects.df[which(subjects.df[,'carrier']),'subj.id']
  noncarrier.subjid <- subjects.df[which(!(subjects.df[,'carrier'])),'subj.id']
  hap.carrier.idx <- which(subjid.haprows.vector %in% carrier.subjid)
  hap.noncarrier.idx <- which(subjid.haprows.vector %in% noncarrier.subjid)

  ## initially all loci are set to the major allele (0)
  ## decide which ones to set to 1 based on MAF (minor allele
  ## frequency) and risk
  ## This next block loops over each SNP setting the minor allele
  ## in the matrix
  for(i in 1:n.snps) {
    MAF <- snps.df[i,'MAF']
    risk <- snps.df[i,'risk']
    ## weight the selection of the minor allele by its overall
    ## frequency and by its risk
    minor.weight.carrier <- MAF * risk
    if(minor.weight.carrier > 1) {
      warn("Given this minor allele frequency, the risk factor seems too high")
      minor.weight.carrier <- 1
    }
    minor.weight.noncarrier <- MAF
    ## sample from the major and minor alleles given these weights
    res.carrier <- rbinom(length(hap.carrier.idx),1,minor.weight.carrier)
    res.noncarrier <- rbinom(length(hap.noncarrier.idx),1,minor.weight.noncarrier)
    haplotype.matrix[hap.carrier.idx,i] <- res.carrier
    haplotype.matrix[hap.noncarrier.idx,i] <- res.noncarrier
  }

  ## now we have a matrix of haplotypes, combine them into the genotypes
  genotype.matrix <- matrix(data=0,nrow=n.subjects,ncol=n.snps)
  rownames(genotype.matrix) <- rownames(subjects.df)
  colnames(genotype.matrix) <- rownames(snps.df)
  for(subj.id in rownames(genotype.matrix)) {
    mat.sub <- haplotype.matrix[which(subjid.haprows.vector==subj.id),]
    genotype.matrix[subj.id,] <- apply(mat.sub,2,sum)
  }

  return(genotype.matrix)
}

n.subjects <- 500
n.cases <- 250
penetrance <- 1
subjects.df <- gen_individuals(n.subjects,n.cases,penetrance)

n.snps <- 5000
n.risk.snps <- 5
risk <- seq(2,4,by=.5)
snps.df <- gen_snps(n.snps,n.risk.snps,risk=risk,MAF=NULL)
snps.df[,'col'] <- 'black'
snps.df[which(snps.df[,'risk'] > 1.5),'col'] <- 'red'

genotype.matrix <- gen_genotypes(snps.df,subjects.df)

## Take a look at the allele distribution:
table(as.vector(genotype.matrix))

library("snpStats")
genotype.sm <- as(genotype.matrix,"SnpMatrix")
results.sst <- single.snp.tests(subjects.df[,'case'],snp.data=genotype.sm)

## let's take a look:
summary(results.sst)

p1 <- p.value(results.sst,df=1)
plot(snps.df[,'pos'],-log10(p1),pch=19,cex=.5,col=snps.df[,'col'],
     main="Manhattan plot",xlab="Chr 10",ylab="-log10(p-value)",
     sub="Color indicates the alleles we created with elevated risk")
abline(h=8,lty=2)

chi2 <- chi.squared(results.sst,df=1)
qq.chisq(chi2,df=1,pch=19,cex=.5)

p1.o <- sort(p1)
snp.o <- snps.df[names(p1.o),]

plot(-log10((1:length(p1.o))/length(p1.o)),-log10(p1.o),
     pch=19,cex=.5,col=snp.o[,'col'],main="QQ plot",
     xlab="Expected",ylab="Observed")
abline(a=0,b=1,lty=2)
