library(readxl) #<-- to read excel file
library(dplyr) #<-- to use select function
library(psych) #<-- to use pipe %>%
library(haven)


#data cleaning
data00 <- read_excel('phase1_dataexcel.xlsx', sheet=2)
data01 <- select(data00, 'Indexno', 'DiagnosedTB', 'Age', 'Gender', 'Maritalstatus', 'CrowdedHome', 
                 'Highincidencearea', 'HCW', 'Educationstatnew', 'Incomegroup', 'DM', 'HIV',
                 'Immunosuppresed', 'Smokingstatus', 'SmokeExposure', 'BCGscar', 'PrevioushxTB', 'Tbcontact',
                 'Cough', 'Sputum', 'Nightsweat', 'Chestpain', 'Weightloss', 'Lossappetite',
                 'Fever', 'Hemoptysis', 'Durationsymptoms')


data02 <- data01 %>% mutate(DiagnosedTB2 = as_factor(DiagnosedTB),
                            Gender2 = as_factor(Gender),
                            Maritalstatus2 = as_factor(Maritalstatus),
                            CrowdedHome2 = as_factor(CrowdedHome),
                            Highincidencearea2 = as_factor(Highincidencearea),
                            HCW2 = as_factor(HCW),
                            Educationstatnew2 = as_factor(Educationstatnew),
                            Incomegroup2 = as_factor(Incomegroup),
                            DM2 = as_factor(DM),
                            HIV2 = as_factor(HIV),
                            Immunosuppresed2 = as_factor(Immunosuppresed),
                            Smokingstatus2 = as_factor(Smokingstatus),
                            BCG2 = as_factor(BCGscar),
                            PrevioushxTB2 = as_factor(PrevioushxTB),
                            Tbcontact2 = as_factor(Tbcontact),
                            Cough2 = as_factor(Cough),
                            Sputum2 = as_factor(Sputum),
                            Nightsweat2 = as_factor(Nightsweat),
                            Chestpain2 = as_factor(Chestpain),
                            Weightloss2 = as_factor(Weightloss),
                            Lossappetite2 = as_factor(Lossappetite),
                            Fever2 = as_factor(Fever),
                            Hemoptysis2 = as_factor(Hemoptysis))


#clinical characteristics
library(ggplot2)
ggplot(data = data02, mapping = aes(x = Durationsymptoms)) +
  geom_histogram() + facet_grid(. ~ DiagnosedTB)

mean(data02$Age)
dataexp1 <- data02 %>% filter(DiagnosedTB == "Yes") 
mean(dataexp1$MonthlyIncome)
sd(dataexp1$SmokeExposure)
summary(dataexp1$Incomegroup2)
IQR(dataexp1$Durationsymptoms)
(1/(45))*100

dataexp2 <- data02 %>% filter(DiagnosedTB == "No")
mean(dataexp2$MonthlyIncome)
sd(dataexp2$SmokeExposure)
summary(dataexp2$Incomegroup2)
IQR(dataexp2$Durationsymptoms)
(1/(114))*100

levels(data02$DiagnosedTB2)


#simple logistic regression
dataslog0Age <- glm(DiagnosedTB2 ~ Age, data=data02, family=binomial (link= "logit"))
summary(dataslog0Age)
exp(coefficients(dataslog0Age))
exp(confint(dataslog0Age))

dataslog0Gender2 <- glm(DiagnosedTB2 ~ Gender2, data=data02, family=binomial (link= "logit"))
summary(dataslog0Gender2)
exp(coefficients(dataslog0Gender2))
exp(confint(dataslog0Gender2))

dataslog0Maritalstatus2 <- glm(DiagnosedTB2 ~ Maritalstatus2, data=data02, family=binomial (link= "logit"))
summary(dataslog0Maritalstatus2)
exp(coefficients(dataslog0Maritalstatus2))
exp(confint(dataslog0Maritalstatus2))

dataslog0CrowdedHome2 <- glm(DiagnosedTB2 ~ CrowdedHome2, data=data02, family=binomial (link= "logit"))
summary(dataslog0CrowdedHome2)
exp(coefficients(dataslog0CrowdedHome2))
exp(confint(dataslog0CrowdedHome2))

dataslog0Highincidencearea2 <- glm(DiagnosedTB2 ~ Highincidencearea2, data=data02, family=binomial (link= "logit"))
summary(dataslog0Highincidencearea2)
exp(coefficients(dataslog0Highincidencearea2))
exp(confint(dataslog0Highincidencearea2))

dataslog0HCW2 <- glm(DiagnosedTB2 ~ HCW2, data=data02, family=binomial (link= "logit"))
summary(dataslog0HCW2)
exp(coefficients(dataslog0HCW2))
exp(confint(dataslog0HCW2))

data02$Educationstatnew2 <- factor(data02$Educationstatnew2, levels = c('Tertiaryabove', 'Secondary', 'Primary'))
levels(data02$Educationstatnew2)
dataslog0Educationstatnew2 <- glm(DiagnosedTB2 ~ Educationstatnew2, data=data02, family=binomial (link= "logit"))
summary(dataslog0Educationstatnew2)
exp(coefficients(dataslog0Educationstatnew2))
exp(confint(dataslog0Educationstatnew2))

dataslog0MonthlyIncome <- glm(DiagnosedTB2 ~ MonthlyIncome, data=data02, family=binomial (link= "logit"))
summary(dataslog0MonthlyIncome)
exp(coefficients(dataslog0MonthlyIncome))
exp(confint(dataslog0MonthlyIncome))

data02$Incomegroup2 <- factor(data02$Incomegroup2, levels = c('M40', 'B40'))
levels(data02$Incomegroup2)
dataslog0Incomegroup2 <- glm(DiagnosedTB2 ~ Incomegroup2, data=data02, family=binomial (link= "logit"))
summary(dataslog0Incomegroup2)
exp(coefficients(dataslog0Incomegroup2))
exp(confint(dataslog0Incomegroup2))

dataslog0DM2 <- glm(DiagnosedTB2 ~ DM2, data=data02, family=binomial (link= "logit"))
summary(dataslog0DM2)
exp(coefficients(dataslog0DM2))
exp(confint(dataslog0DM2))

dataslog0HIV2 <- glm(DiagnosedTB2 ~ HIV2, data=data02, family=binomial (link= "logit"))
summary(dataslog0HIV2)
exp(coefficients(dataslog0HIV2))
exp(confint(dataslog0HIV2))

dataslog0Immunosuppresed2 <- glm(DiagnosedTB2 ~ Immunosuppresed2, data=data02, family=binomial (link= "logit"))
summary(dataslog0Immunosuppresed2)
exp(coefficients(dataslog0Immunosuppresed2))
exp(confint(dataslog0Immunosuppresed2))

dataslog0Smokingstatus2 <- glm(DiagnosedTB2 ~ Smokingstatus2, data=data02, family=binomial (link= "logit"))
summary(dataslog0Smokingstatus2)
exp(coefficients(dataslog0Smokingstatus2))
exp(confint(dataslog0Smokingstatus2))

dataslog0SmokeExposure <- glm(DiagnosedTB2 ~ SmokeExposure, data=data02, family=binomial (link= "logit"))
summary(dataslog0SmokeExposure)
exp(coefficients(dataslog0SmokeExposure))
exp(confint(dataslog0SmokeExposure))

dataslog0BCG2 <- glm(DiagnosedTB2 ~ BCG2, data=data02, family=binomial (link= "logit"))
summary(dataslog0BCG2)
exp(coefficients(dataslog0BCG2))
exp(confint(dataslog0BCG2))

dataslog0PrevioushxTB2 <- glm(DiagnosedTB2 ~ PrevioushxTB2, data=data02, family=binomial (link= "logit"))
summary(dataslog0PrevioushxTB2)
exp(coefficients(dataslog0PrevioushxTB2))
exp(confint(dataslog0PrevioushxTB2))

dataslog0Tbcontact2 <- glm(DiagnosedTB2 ~ Tbcontact2, data=data02, family=binomial (link= "logit"))
summary(dataslog0Tbcontact2)
exp(coefficients(dataslog0Tbcontact2))
exp(confint(dataslog0Tbcontact2))

data02$Cough2 <- factor(data02$Cough2, levels = c('No', 'Yes'))    
levels(data02$Cough2)
dataslog0Cough2 <- glm(DiagnosedTB2 ~ Cough2, data=data02, family=binomial (link= "logit"))
summary(dataslog0Cough2)
exp(coefficients(dataslog0Cough2))
exp(confint(dataslog0Cough2))

dataslog0Sputum2 <- glm(DiagnosedTB2 ~ Sputum2, data=data02, family=binomial (link= "logit"))
summary(dataslog0Sputum2)
exp(coefficients(dataslog0Sputum2))
exp(confint(dataslog0Sputum2))

dataslog0Nightsweat2 <- glm(DiagnosedTB2 ~ Nightsweat2, data=data02, family=binomial (link= "logit"))
summary(dataslog0Nightsweat2)
exp(coefficients(dataslog0Nightsweat2))
exp(confint(dataslog0Nightsweat2))
data02$Nightsweat2 <- factor(data02$Nightsweat2, levels = c('No', 'Yes'))    
levels(data02$Nightsweat2)

dataslog0Chestpain2 <- glm(DiagnosedTB2 ~ Chestpain2, data=data02, family=binomial (link= "logit"))
summary(dataslog0Chestpain2)
exp(coefficients(dataslog0Chestpain2))
exp(confint(dataslog0Chestpain2))


dataslog0Weightloss2 <- glm(DiagnosedTB2 ~ Weightloss2, data=data02, family=binomial (link= "logit"))
summary(dataslog0Weightloss2)
exp(coefficients(dataslog0Weightloss2))
exp(confint(dataslog0Weightloss2))

dataslog0Lossappetite2 <- glm(DiagnosedTB2 ~ Lossappetite2, data=data02, family=binomial (link= "logit"))
summary(dataslog0Lossappetite2)
exp(coefficients(dataslog0Lossappetite2))
exp(confint(dataslog0Lossappetite2))

dataslog0Fever2 <- glm(DiagnosedTB2 ~ Fever2, data=data02, family=binomial (link= "logit"))
summary(dataslog0Fever2)
exp(coefficients(dataslog0Fever2))
exp(confint(dataslog0Fever2))

dataslog0Hemoptysis2 <- glm(DiagnosedTB2 ~ Hemoptysis2, data=data02, family=binomial (link= "logit"))
summary(dataslog0Hemoptysis2)
exp(coefficients(dataslog0Hemoptysis2))
exp(confint(dataslog0Hemoptysis2))

dataslog0Durationsymptoms<- glm(DiagnosedTB2 ~ Durationsymptoms, data=data02, family=binomial (link= "logit"))
summary(dataslog0Durationsymptoms)
exp(coefficients(dataslog0Durationsymptoms))
exp(confint(dataslog0Durationsymptoms))


#multiple logistic regression
datalog02v <- glm(DiagnosedTB2 ~ Highincidencearea2 + Educationstatnew2 + Smokingstatus2 + SmokeExposure +
                    PrevioushxTB2 + Nightsweat2 +
                    Chestpain2 + Weightloss2 + Lossappetite2 + Hemoptysis2 +
                    Durationsymptoms, data=data02, family=binomial (link= "logit"))
summary(datalog02v)
exp(coefficients(datalog02v))



#finalmodel
datalog08 <- glm(DiagnosedTB2 ~ 
                   Smokingstatus2 + 
                   Nightsweat2 +
                   Weightloss2 + Durationsymptoms, data=data02, family=binomial (link= "logit"))
summary(datalog08)
exp(coefficients(datalog08))
exp(confint(datalog08))

#checking for two-way interaction
datalog08.int1 <- glm(DiagnosedTB2 ~ Smokingstatus2 + Nightsweat2 + Weightloss2 + Durationsymptoms + Weightloss2*Durationsymptoms, 
                      data=data02, family=binomial (link= "logit"))
summary(datalog08.int1)


#model checking

library(generalhoslem)
logitgof(data02$DiagnosedTB2, fitted(datalog08), g = 10)

prob <- predict(datalog08, newdata=data02, type="response")
pred <- prediction(prob, datalog08$DiagnosticTB2)
perf <- performance(pred, measure = "tpr", x.measure = "fpr")
plot(perf)
auc <- performance(pred, measure = "auc")
auc <- auc@y.values[[1]]
auc


#changing to numeric
data13 <- data02 %>% transform(DiagnosedTB3 = as.numeric(DiagnosedTB2), 
                               Smokingstatus3 = as.numeric(Smokingstatus2),
                               Nightsweat3 = as.numeric(Nightsweat2),
                               Weightloss3 = as.numeric(Weightloss2))
data16 <- dplyr::select(data13, 'DiagnosedTB3', 'Smokingstatus3', 'Nightsweat3', 'Weightloss3', 
                        'Durationsymptoms')


#checking multicollinearity
library(corpcor)
cor2pcor(cov(data16))
mc16 <- cor2pcor(cov(data16))
