######################################################################
# Code for "A guide to QTL mapping with R/qtl"
# Karl W Broman and Saunak Sen
#
# Chapter 3: Data checking
#######################################################################

######################################################################
# 3.1 Phenotypes
######################################################################

# Load the qtl and qtlbook packages and the ch3a data
library(qtl)
library(qtlbook)
data(ch3a)

# Code to make histograms of each phenotype in the ch3a data
par(mfrow=c(3,2))
for(i in 1:5)
  plot.pheno(ch3a, pheno.col=i)

# Code to create a scatterplot matrix of the phenotypes
# in the ch3a data
pairs(jitter( as.matrix(ch3a$pheno) ), cex=0.6, las=1)

# Remove 0 phenotypes
ch3a$pheno[ch3a$pheno == 0] <- NA

# Code to create a plots of each individual's average phenotype
# against its index and then against randomized indices
par(mfrow=c(1,2), las=1, cex=0.8)
means <- apply(ch3a$pheno, 1, mean)
plot(means)
plot(sample(means), xlab="Random index", ylab="means")

######################################################################
# 3.2 Segregation distortion
######################################################################

# Segregation distortion in the ch3b data
data(ch3b)
gt <- geno.table(ch3b)
gt[ gt$P.value < 1e-7, ]

# Segregation distortion in the listeria data
data(listeria)
gt <- geno.table(listeria)
p <- gt$P.value
gt[ !is.na(p) & p < 0.01, ]

######################################################################
# 3.3 Compare individuals' genotypes
######################################################################

# Histogram of the proportion of markers with identical 
# genotypes for each pair of individuals in the ch3a data.
data(ch3a)
cg <- comparegeno(ch3a)
hist(cg, breaks=200, 
     xlab="Proportion of identical genotypes")
rug(cg)

# Identify the relevant pairs
which(cg > 0.9, arr.ind=TRUE)

######################################################################
# 3.4 Check marker order
######################################################################

###################################
# 3.4.1 Pairwise recombination fractions
###################################

# Calculate recombination fractions between all pairs of 
# markers in the ch3c data
data(ch3c)
ch3c <- est.rf(ch3c)

# Get details on the markers with alleles potentially swapped
checkAlleles(ch3c)

# Look more carefully at the genotypes on chr 1,
# in order to figure out which of the markers
# has problems
pull.map(ch3c, 1)
geno.crosstab(ch3c, "c1m3", "c1m4")
geno.crosstab(ch3c, "c1m3", "c1m5")
geno.crosstab(ch3c, "c1m4", "c1m5")

# Swap the alleles at marker 2 and put the
# data back in the ch3c object.
g <- pull.geno(ch3c, 1)
g[,"c1m3"] <- 4 - g[,"c1m3"]
ch3c$geno[[1]]$data <- g

# Swap the alleles at marker 2 on chr 7
g <- pull.geno(ch3c, chr=7)
g[,"c7m2"] <- 4 - g[,"c7m2"]
ch3c$geno[[7]]$data <- g

# Rerun est.rf and checkAlleles
ch3c <- est.rf(ch3c)
checkAlleles(ch3c)

# Plot the recombination fractions for the ch3c data
plot.rf(ch3c, alternate.chrid=TRUE)

# Plot rec fracs for selected chromosomes
plot.rf(ch3c, chr=c(1, 7, 12, 13, 15))

# Estimate genetic map and plot it
nm <- est.map(ch3c, error.prob=0.001)
plot(nm)

# Get names of problem markers and then move them to the
# chromosomes to which they seem to belong
ch3c <- movemarker(ch3c, find.marker(ch3c, 7,index=4), 15)
ch3c <- movemarker(ch3c, find.marker(ch3c,12,index=2),  1)
ch3c <- movemarker(ch3c, find.marker(ch3c,12,index=1),  7)
ch3c <- movemarker(ch3c, find.marker(ch3c,13,index=1), 12)
ch3c <- movemarker(ch3c, find.marker(ch3c,15,index=5), 12)

# Plot the recombination fractions for the hyper data.
data(hyper)
hyper <- est.rf(hyper)
plot.rf(hyper, alternate.chrid=TRUE)

###################################
# 3.4.2 Rippling marker order
###################################

# Assess marker orders on chr 1 in the ch3c data
rip <- ripple(ch3c, 1, 5)

# Summary of the ripple results
summary(rip)

# Switch order of markers on chr 1
ch3c <- switch.order(ch3c, 1, rip[2,])

# ripple via maximum likelihood
rip <- ripple(ch3c, 1, 3, method="likelihood",
              error.prob=0.001, verbose=FALSE)
summary(rip)

# Apply ripple to all chromosomes
rip <- vector("list", nchr(ch3c))
names(rip) <- names(ch3c$geno)
for(i in names(ch3c$geno))
  rip[[i]] <- ripple(ch3c, i, 7, verbose=FALSE)

# Get improvement in number of obligate crossovers
dif.nxo <- sapply(rip, function(a) a[1,ncol(a)]-a[2,ncol(a)])
dif.nxo

# Switch orders on chromosomes showing possible improvement
for(i in names(ch3c$geno)) {
  if(dif.nxo[i] > 0)
    ch3c <- switch.order(ch3c, i, rip[[i]][2,])
}

# Repeat ripple with method="countxo"
for(i in names(ch3c$geno))
  rip[[i]] <- ripple(ch3c, i, 7, verbose=FALSE)
dif.nxo <- sapply(rip, function(a) a[1,ncol(a)]-a[2,ncol(a)])

any(dif.nxo > 0)

# Ripple all chromosomes by maximum likelihood
for(i in names(ch3c$geno))
  rip[[i]] <- ripple(ch3c, i, 3, method="likelihood", 
                     error.prob=0.001, verbose=FALSE)
lod <- sapply(rip, function(a) a[2, ncol(a)-1])

lod[lod > 0]

# summary of ripple results for X chromosome
summary(rip[["X"]])

# rerun est.rf and plot.rf
ch3c <- est.rf(ch3c)
plot.rf(ch3c, chr=c(1, 7, 12, 13, 15))

###################################
# 3.4.3 Estimate genetic map
###################################

# estimate map for ch3c data and plot it against that in the data
nm <- est.map(ch3c, error.prob=0.001, verbose=FALSE)
plot.map(ch3c, nm)

# replace map
ch3c <- replace.map(ch3c, nm)

######################################################################
# 3.5 Identifying genotyping errors
######################################################################

# calculate genotyping error LOD scores
data(hyper)
newmap <- est.map(hyper, error.prob=0.01)
hyper <- replace.map(hyper, newmap)
hyper <- calc.errorlod(hyper)

# The top error LOD scores for the hyper data
top <- top.errorlod(hyper, cutoff=5)
top

# plot genotypes for hyper data, chr 16
plot.geno(hyper, 16, top$id[top$chr==16], cutoff=5)

######################################################################
# 3.6 Counting crossovers
######################################################################

nxo <- countXO(hyper)
plot(nxo, ylab="No. crossovers")

# Individuals with many crossovers
nxo[nxo>25]

# ave no. XOs for first 92 and last 158 individuals
mean(nxo[1:92])
mean(nxo[-(1:92)])

# No. XOs by chromosome for individual 56.
countXO(hyper, bychr=TRUE)[56,]

######################################################################
# 3.7 Missing genotype information
######################################################################

# plot of missing genotype information in hyper data
plot.info(hyper, col=c("blue", "red"))

# save plot info results in an object
z <- plot.info(hyper, step=0)
z[ z[,1]==14, ]

# Plot of Number of missing genotypes at each marker
hist(nmissing(hyper, what="mar"), breaks=50)

# end of chap03.R