# ---- Step 1: drop columns (loci) with too much missing data ----
# Reads the raw STRUCTURE-style table, keeps only the columns whose share of
# missing genotypes is within the allowed limit, and writes the result as a
# ".stru" file with missing values coded as -9.

# NOTE(review): a hard-coded setwd() ties the script to one machine. It is
# kept because every file name below is relative to this directory.
setwd("~/Documents/AdminStuff/Lizzie/August2017/")

# Genotypes coded "0" are imported as NA. The input needs a header entry above
# the individual-name / population columns so that it imports cleanly.
# Expected here: 40 rows, i.e. 20 individuals with two rows each.
test <- read.table("batch_14mike.structure.tsv", header = TRUE,
                   row.names = NULL, na.strings = "0")

# Largest proportion of missing values a column may have and still be kept.
# Change this single value to make the filter stricter or more permissive.
max_missing <- 0.2

# Logical keep-mask with one entry per column (TRUE = keep the column).
# According to the original author the first two (label) columns contain no
# NA, so they always pass. vapply() inspects every column directly instead of
# letting apply() coerce the whole data frame to a character matrix first.
PropotionNA <- vapply(
  test,
  function(col) sum(is.na(col)) / length(col) <= max_missing,
  logical(1)
)
test.cleaned <- test[PropotionNA]

# Tab-separated output without row names or quotes; NA is written back as -9.
write.table(test.cleaned, "batch_14mike.structureMissing20Percent.stru",
            row.names = FALSE, quote = FALSE, sep = "\t", na = "-9")


#######adegenet#############

library(adegenet)

#its been a while since I used this package
#####
#PCA#
#####

# Plain-table view of the filtered file, useful for checking its dimensions.
placo <- read.table(
  "batch_14mike.structureMissing20Percent.stru",
  sep = "\t",
  header = TRUE,
  row.names = NULL
)

# Build a genind object from the STRUCTURE file.
# The file name has to end in ".stru" for read.structure().
?read.structure
# n.ind and n.loc are hard-coded: update them whenever the pre-filtering
# changes the number of individuals or retained loci.
# NOTE(review): nothing checks these numbers against the file - confirm that
# n.ind = 19 and n.loc = 341 match the table read into `placo`.
placo.genind <- read.structure(
  "batch_14mike.structureMissing20Percent.stru",
  onerowperind = FALSE,
  n.ind = 19,
  n.loc = 341,
  col.lab = 1,
  col.pop = 2,
  ask = FALSE,
  quiet = TRUE,
  NA.char = "-9"
)

# Quick sanity checks on what was imported.
str(pop(placo.genind))
pop(placo.genind)        # population factor: levels look right
placo.genind@all.names   # presumably the alleles seen at each locus (01, 03)
indNames(placo.genind)   # individual labels

# ---- Run the PCA ----
?dudi.pca
# A PCA cannot be run on missing data, so the genotype table is extracted
# with NAs replaced through tab()'s "mean" method.
?tab
placo.genind.missingMean <- tab(placo.genind, NA.method = "mean")

# pca1: unscaled PCA (center = TRUE is the default); used by the plots below.
pca1 <- dudi.pca(placo.genind.missingMean, scannf = FALSE, scale = FALSE)
# pca2: scaled variant. scannf = TRUE shows the eigenvalue plot automatically.
# Per the original author, scaling does not seem to change anything here.
pca2 <- dudi.pca(placo.genind.missingMean, scannf = TRUE, scale = TRUE)

# Quick look at the results: row coordinates, eigenvalues, column coordinates.
plot(pca1$li)
pca1$eig
pca1$co

# ---- Prettier output: axis percentages for the plot title ----

# Each eigenvalue as a percentage of their sum, stored on the pca1 object
# next to the raw eigenvalues.
eig_total <- sum(pca1$eig)
pca1$eigval <- pca1$eig / eig_total * 100
pca1$eigval

# Title of the form "PC1 (xx.xx %) / PC2 (yy.yy %)".
pc_pct <- round(pca1$eigval[1:2], digits = 2)
title <- paste0("PC1 (", pc_pct[1], " %)", " / PC2 (", pc_pct[2], " %)")
title

library(ggplot2)
library(ggrepel)

# Bare scatter of the individuals on the first two PCA axes.
plottets <- ggplot() +
  geom_point(data = pca1$li, mapping = aes(x = Axis1, y = Axis2)) +
  ggtitle(title) +
  theme_bw() +
  theme(legend.key.size = unit(0.05, "cm"))

plot(plottets)

# ---- PCA coloured by population ----
# Per-individual metadata table. The original author notes that a species
# column was planned but the assignments looked ambiguous, and that this file
# also works for Mike's filtered data set.
pops <- read.table("placorm257.pops", header = TRUE, row.names = NULL,
                   sep = "\t")
length(pops$Population)

# The colour aesthetic is taken straight from `pops`, so it has to line up
# row-for-row with the PCA coordinates; fail early with a clear message if
# the two tables differ in length.
# NOTE(review): matching row ORDER is assumed, not checked - confirm.
stopifnot(nrow(pops) == nrow(pca1$li))

# Labels now come from the same table as the plotted points (pca1$li) rather
# than from pca1$l1, so coordinates and labels cannot drift apart.
plottets <- ggplot(
  data = pca1$li,
  aes(Axis1, Axis2,
      col = as.factor(pops$Population),
      label = row.names(pca1$li))
) +
  geom_point(size = 3) +
  geom_text_repel(
    fontface = "bold",
    colour = "black",
    arrow = arrow(length = unit(0.01, "npc")),
    force = 10,
    point.padding = unit(0.5, "lines")
  ) +
  ggtitle(title) +
  theme_bw() +
  labs(colour = "Pop grouping")
plottets

# PCA coloured by the `species` column of pops, with repelled name labels.
# Labels are taken from pca1$li, the same table that supplies the plotted
# coordinates (previously pca1$l1, the normed scores).
# NOTE(review): confirm that placorm257.pops really has a `species` column;
# the comment where pops is loaded suggests it may have been left out.
plottets <- ggplot(
  data = pca1$li,
  aes(Axis1, Axis2,
      color = pops$species,
      label = row.names(pca1$li))
) +
  geom_point(size = 3) +
  geom_text_repel(
    fontface = "bold",
    point.padding = unit(0.5, "lines"),
    colour = "black",
    force = 5
  ) +
  ggtitle(title) +
  theme_bw() +
  labs(colour = "Sub Species")
plottets
library(ggrepel)
# Print the labels that will be drawn next to the points. They are read from
# pca1$li, the table that is actually plotted (previously pca1$l1).
row.names(pca1$li)

# Population as point shape, species as colour.
# NOTE(review): the colour legend is titled "Populations" although colour
# maps pops$species and shape maps the population - confirm which title is
# intended before changing it.
plottets <- ggplot(
  data = pca1$li,
  aes(Axis1, Axis2,
      shape = as.factor(pops$Population),
      label = row.names(pca1$li),
      col = pops$species)
) +
  geom_point(size = 3) +
  geom_text_repel(
    fontface = "bold",
    point.padding = unit(0.5, "lines"),
    colour = "black"
  ) +
  ggtitle(title) +
  theme_bw() +
  labs(colour = "Populations")
plottets





library(ggrepel)
# Same scatter, coloured by pops$Pop and labelled with pops$Ind.
# NOTE(review): `$` on a data frame matches column names partially, so
# pops$Pop / pops$Ind may resolve to longer names such as "Population" -
# confirm the actual column names in placorm257.pops.
plottets <- ggplot(
  data = pca1$li,
  mapping = aes(Axis1, Axis2, color = pops$Pop, label = pops$Ind)
) +
  geom_point(size = 3) +
  geom_text_repel(
    fontface = "bold",
    point.padding = unit(0.5, "lines"),
    colour = "black"
  ) +
  ggtitle(title) +
  theme_bw() +
  labs(colour = "Sub Species")
plottets



# ---- Group ellipses on the PCA scatter ----
# Original author's caveat: probably not very informative here unless the
# individuals are regrouped more broadly.

library(ellipse)
group <- pops$Pop
x <- pca1$li$Axis1
y <- pca1$li$Axis2
df <- data.frame(x = x, y = y, group = factor(group))

# One ellipse outline per group level, computed from the correlation, the
# standard deviations and the means of that group's coordinates. The pieces
# are collected with lapply() and bound once, instead of growing a data frame
# with rbind() on every loop iteration.
ell_parts <- lapply(levels(df$group), function(g) {
  pts <- df[df$group == g, ]
  outline <- ellipse(
    cor(pts$x, pts$y),
    scale = c(sd(pts$x), sd(pts$y)),
    centre = c(mean(pts$x), mean(pts$y))
  )
  cbind(as.data.frame(outline), group = g)
})
# With no group levels there is nothing to bind; keep the empty data frame
# the loop version would have produced.
df_ell <- if (length(ell_parts) > 0) {
  do.call(rbind, ell_parts)
} else {
  data.frame()
}

# Points coloured by group, with the matching ellipse outlines on top.
plottets <- ggplot() +
  geom_point(data = pca1$li, aes(Axis1, Axis2, color = pops$Pop)) +
  ggtitle(title) +
  geom_path(data = df_ell, aes(x = x, y = y, colour = group)) +
  theme_bw() +
  labs(colour = "Sub Species")
plottets

#I might have messed up the subspecies names because when I do this it looks shonky as hell

######
#DAPC#
######
# This is not working and the reason is not fully clear; the suspicion is the
# small number of individuals.
# NOTE(review): with so few samples, retaining too many PCs in dapc() is a
# plausible cause of a degenerate scatter - confirm by fixing n.pca / n.da.

# placo.genind is replaced here by a different data set (batch_21).
placo.genind <- read.structure(
  "batch_21.structureMissing20Percent.newpops.stru",
  onerowperind = FALSE,
  n.ind = 23,
  n.loc = 1804,
  col.lab = 1,
  col.pop = 2,
  ask = FALSE,
  quiet = TRUE,
  NA.char = "-9"
)

# Infer genetic clusters, considering at most four.
grp <- find.clusters(placo.genind, max.n.clust = 4)

names(grp)
# DAPC on the inferred cluster membership.
dapc1 <- dapc(placo.genind, grp[["grp"]])
# Cross-tabulate the original populations against the inferred clusters.
table(pop(placo.genind), grp[["grp"]])
grp[["size"]]

scatter(dapc1)

# Where are the points? Inspect the individual coordinates to see whether
# they all sit on top of one another.
dapc1[["ind.coord"]]
# Something is wrong with this method on this data; it has not been tried on
# such a small data set before. To be followed up - otherwise the analysis
# may have to stay a PCA.




# ---- DAPC tutorial example (adegenet's dapcIllus data) for comparison ----
# Note: x, grp and dapc1 from the sections above are overwritten here.
data(dapcIllus)
x <- dapcIllus[["a"]]
grp <- find.clusters(x, max.n.clust = 40)
table(pop(x), grp[["grp"]])
grp[["size"]]
length(x$pop)  # 600 individuals
dapc1 <- dapc(x, grp[["grp"]])
scatter(dapc1)

#' Evenly spaced HCL colours
#'
#' Returns `n` colours whose hues are spread evenly around the colour wheel
#' starting at hue 15, at fixed luminance 65 and chroma 100. Presumably meant
#' to reproduce ggplot2's default discrete palette (hence the name).
#'
#' @param n Number of colours wanted; a single non-negative whole number.
#' @return A character vector of `n` hex colour codes (`character(0)` when
#'   `n` is 0).
gg_color_hue <- function(n) {
  # n + 1 hues from 15 to 375: the last one is the same angle as the first
  # (375 = 15 + 360), so only the first n are kept.
  hues <- seq(15, 375, length.out = n + 1)
  # seq_len(n) rather than 1:n, so that n = 0 yields no colours instead of
  # one (1:0 is c(1, 0), which would select the first element).
  hcl(h = hues, l = 65, c = 100)[seq_len(n)]
}

# Example: build a ten-colour palette and show the hex codes.
n <- 10
cols <- gg_color_hue(n)
cols

