Classification on 4FGL (Part1)
Via XG Boost, Random Forest and Boosted LR
SH
Other links
This is Script 2.
Source
“FGL4.rData” are downloaded from Fermi LAT data products (https://fermi.gsfc.nasa.gov/ssc/data/access/).
Reference publication:
“Classification and Ranking of Fermi LAT Gamma-ray Sources from the 3FGL Catalog Using Machine Learning Techniques”, Saz Parkinson, P. M. (HKU/LSR, SCIPP), Xu, H. (HKU), Yu, P. L. H. (HKU), Salvetti, D. (INAF-Milan), Marelli, M. (INAF-Milan), and Falcone, A. D. (Penn State), The Astrophysical Journal, 2016. (http://arxiv.org/abs/1602.00385)
Difference on feature selection between 4FGL & 3FGL results
4FGL field | 3FGL field | Note |
---|---|---|
PL_Flux_Density | Flux_Density | simple power-law form |
LP_Flux_Density | LogParabola form, for all significantly curved spectra except pulsars | |
PLEC_Flux_Density | PLSuperExpCutoff form, a subexponentially cutoff power law for all significantly curved pulsars | |
LP_SigCurv | Signif_Curve | Significance (in σ units) of the fit improvement between PowerLaw and PLSuperExpCutoff. A value greater than 4 indicates significant curvature |
PLEC SigCurv | ||
Flux_Band.2 | Flux100_300 | |
Flux_Band.3 | Flux300_1000 | |
Flux_Band.4 | Flux1000_3000 | |
Flux_Band.5 | Flux3000_10000 | |
Flux_Band.6 | Flux10000_100000 | replace 10-100 GeV with 10 - 30GeV |
Flux_Band.7 | Flux10000_100000 | replace 10-100 GeV with 30 - 300 GeV |
Step 1. Load fermicatsR package with added 4th Fermi LAT Catalogs; Split data
library(devtools)
load_all('fermicatsR') # load package w/o installing
library(fermicatsR)
library(dplyr)
library(caret)
library(DMwR)
load("fermicatsR/FGL4_results.rData")
load("fermicatsR/FGL4_tidy.rData")
# Set Random seed
set.seed(2)
# Select data for PSR (pulsarness == "Pulsar") vs AGN (agnness == "AGN") classification
FGL4_AGNPSR <- filter(FGL4_tidy, pulsarness == "Pulsar" | agnness == "AGN")
# Drop sources with missing values and some unused variables
FGL4_AGNPSR <- na.omit(FGL4_AGNPSR)
FGL4_AGNPSR <- select(FGL4_AGNPSR, -CLASS1, -ASSOC1, -ASSOC2, -CLASS2,
-SpectrumType, -Source_Name, -ASSOC_FGL, -GLAT, -Conf_95_PosAng, -agnness)
# Separate into training (70%) and testing (30%) sets
train_idx <- sample(nrow(FGL4_AGNPSR), round(nrow(FGL4_AGNPSR)*0.7))
FGL4_test <- FGL4_AGNPSR[-train_idx, ]
FGL4_train <- FGL4_AGNPSR[train_idx, ]
table(FGL4_train$pulsarness)
##
## Non-Pulsar Pulsar
## 2276 185
### SMOTE
# Very imbalanced dataset, so let's see if using smote can improve this model.
i <- grep("pulsarness", colnames(FGL4_train)) # Get index Class column
train_smote <- SMOTE(pulsarness ~ ., as.data.frame(FGL4_train),
perc.over = 1200, perc.under=100, #learner='rpartXse',se=0.5
)
table(train_smote$pulsarness)
##
## Non-Pulsar Pulsar
## 2220 2405
# table(train_smote$pulsarness)
# Split training data into k (10) blocks for k-fold cross-validation
k <- 10
Index <- sample(nrow(FGL4_train))
Block_index <- matrix(data = Index[1:(floor(nrow(FGL4_train)/k)*k)], nrow = k, byrow = TRUE)
FGL4_train_CV <- FGL4_train[Index[1:(floor(nrow(FGL4_train)/k)*k)], ]
# Initialise predictions vector
predictions_FGL4_train_CV <- rep(0, nrow(FGL4_train_CV))
# select the group of variable using LP spectrum type
FGL4_LP_train <- select(FGL4_train, -starts_with("PLEC_"), - starts_with("PL_"))
FGL4_LP_test <- select(FGL4_test, -starts_with("PLEC_"), - starts_with("PL_"))
FGL4_LP_train_CV <- select(FGL4_train_CV, -starts_with("PLEC_"), - starts_with("PL_"))
smote_LP_train <- select(train_smote, -starts_with("PLEC_"), - starts_with("PL_"))
# select the group of variable using PLEC spectrum type
FGL4_PLEC_train <- select(FGL4_train, -starts_with("PL_"), - starts_with("LP_"))
FGL4_PLEC_test <- select(FGL4_test, -starts_with("PL_"), - starts_with("LP_"))
FGL4_PLEC_train_CV <- select(FGL4_train_CV, -starts_with("PL_"), - starts_with("LP_"))
smote_PLEC_train <- select(train_smote, -starts_with("PL_"), - starts_with("LP_"))
Step 2. Function to calculate AUC as an evaluation metric, get best threshold and plot ROC curves
# Load workspace from previous step
load(".RData")
library(keras)
#install_keras()
library(pROC)
kerasAUC <- function(y_true, y_pred){
true= k_flatten(y_true)
pred = k_flatten(y_pred)
#total number of elements in this batch
totalCount = k_shape(true)[1]
#sorting the prediction values in descending order
values = tensorflow::tf$nn$top_k(pred,k=totalCount)
indices<-values[[1]]
values<-values[[0]]
#sorting the ground truth values based on the predictions above
sortedTrue = k_gather(true, indices)
#getting the ground negative elements (already sorted above)
negatives = 1 - sortedTrue
#the true positive count per threshold
TPCurve = k_cumsum(sortedTrue)
#area under the curve
auc = k_sum(TPCurve * negatives)
#normalizing the result between 0 and 1
totalCount = k_cast(totalCount, k_floatx())
positiveCount = k_sum(true)
negativeCount = totalCount - positiveCount
totalArea = positiveCount * negativeCount
return (auc / totalArea)
}
ROC_threshold_plots_tables <- function(truth_train, prediction_train,
truth_test, prediction_test,
threshold = 0, pos = "Pulsar", neg = "Non-Pulsar", print_table=TRUE) {
if (threshold == 0) {
# Compute best threshold if none is provided
ROC <- roc(truth_train, prediction_train, levels = c(pos,neg))
ROC_table <- cbind(ROC$thresholds, ROC$specificities, ROC$sensitivities)
best_threshold <- ROC_table[which.max(ROC_table[, 2] + ROC_table[, 3]), ]
} else {
# Don't compute threshold if one is provided
best_threshold <- threshold
}
if (nargs() > 2) {
# Make ROC plots
par(mfrow = c(1, 2))
ROC_train <- roc(truth_train, prediction_train, levels = c(pos,neg),
plot = TRUE, print.auc = TRUE, main = "ROC Train", print.thres = "best")
ROC_test <- roc(truth_test, prediction_test, levels = c(pos,neg),
plot = TRUE, print.auc = TRUE, main = "ROC Test", print.thres = "best")
if (print_table) {
# Generate contingency tables
# Train data
nrow_train <- length(prediction_train)
Predict_class_train <- rep("NA", nrow_train)
Predict_class_train <- ifelse(prediction_train > best_threshold[1], pos, neg)
real_category <- truth_train
print(table(Predict_class_train, real_category))
# Test data
nrow_test <- length(prediction_test)
Predict_class_test <- rep("NA", nrow_test)
Predict_class_test <- ifelse(prediction_test > best_threshold[1], pos, neg)
real_category <- truth_test
print(table(Predict_class_test, real_category))
}
}
return(best_threshold)
}
# Save current workspace for subsequent steps
save.image()
Step 3. AGN vs PSR using Boosted Logistic Regression
library(RWeka)
library(party)
## Loading required package: mvtnorm
## Loading required package: modeltools
## Loading required package: stats4
##
## Attaching package: 'modeltools'
## The following object is masked from 'package:keras':
##
## fit
## Loading required package: strucchange
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
## Loading required package: sandwich
##### ##### ##### ##### ##### ##### ##### ##### #####
# Choose the group of variable using PLEC spectrum type
Model <- LogitBoost(pulsarness ~., data = FGL4_PLEC_train)
# summary(Model)
# Model predictions
predictions_FGL4_train <- predict(Model, newdata = FGL4_PLEC_train, type = "probability")[, 2]
predictions_FGL4_test <- predict(Model, newdata = FGL4_PLEC_test, type = "probability")[, 2]
# Generate ROC plots and print contingency tables
Best_threshold_train <- ROC_threshold_plots_tables(FGL4_PLEC_train$pulsarness,
predictions_FGL4_train,
FGL4_PLEC_test$pulsarness,
predictions_FGL4_test)
## Setting direction: controls > cases
## Setting direction: controls > cases
## Setting direction: controls > cases
## real_category
## Predict_class_train Non-Pulsar Pulsar
## Non-Pulsar 2178 7
## Pulsar 98 178
## real_category
## Predict_class_test Non-Pulsar Pulsar
## Non-Pulsar 950 4
## Pulsar 42 59
FGL4_BLR_PLEC_test <- ifelse (predictions_FGL4_test > Best_threshold_train[1],
"Pulsar", "Non-Pulsar")
cm_BLR_PLEC<- confusionMatrix(factor(FGL4_BLR_PLEC_test),
factor(FGL4_PLEC_test$pulsarness), positive="Pulsar")
print(cm_BLR_PLEC)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Non-Pulsar Pulsar
## Non-Pulsar 950 4
## Pulsar 42 59
##
## Accuracy : 0.9564
## 95% CI : (0.9423, 0.9679)
## No Information Rate : 0.9403
## P-Value [Acc > NIR] : 0.01312
##
## Kappa : 0.6972
##
## Mcnemar's Test P-Value : 4.888e-08
##
## Sensitivity : 0.93651
## Specificity : 0.95766
## Pos Pred Value : 0.58416
## Neg Pred Value : 0.99581
## Prevalence : 0.05972
## Detection Rate : 0.05592
## Detection Prevalence : 0.09573
## Balanced Accuracy : 0.94708
##
## 'Positive' Class : Pulsar
##
# Add LR Prediction Probabilities to FGL4_results data frame:
FGL4_results$BLR_P_PLEC <- round(predict(Model, newdata = FGL4_tidy, type = "probability")[, 2], digits = 3)
# Add LR Prediction category to FGL4_results data frame:
FGL4_results$BLR_Pred_PLEC <- ifelse(FGL4_results$BLR_P_PLEC > Best_threshold_train[1],
"PSR", "AGN")
##### ##### ##### ##### ##### ##### ##### ##### #####
# Choose the group of variable using LP spectrum type
Model <- LogitBoost(pulsarness ~., data = FGL4_LP_train)
# summary(Model)
# Model predictions
predictions_FGL4_train <- predict(Model, newdata = FGL4_LP_train, type = "probability")[, 2]
predictions_FGL4_test <- predict(Model, newdata = FGL4_LP_test, type = "probability")[, 2]
# Generate ROC plots and print contingency tables
Best_threshold_train <- ROC_threshold_plots_tables(FGL4_LP_train$pulsarness,
predictions_FGL4_train,
FGL4_LP_test$pulsarness,
predictions_FGL4_test)
## Setting direction: controls > cases
## Setting direction: controls > cases
## Setting direction: controls > cases
## real_category
## Predict_class_train Non-Pulsar Pulsar
## Non-Pulsar 2215 10
## Pulsar 61 175
## real_category
## Predict_class_test Non-Pulsar Pulsar
## Non-Pulsar 968 5
## Pulsar 24 58
FGL4_BLR_LP_test <- ifelse (predictions_FGL4_test > Best_threshold_train[1],
"Pulsar", "Non-Pulsar")
cm_BLR_LP<- confusionMatrix(factor(FGL4_BLR_LP_test),
factor(FGL4_LP_test$pulsarness), positive="Pulsar")
print(cm_BLR_LP)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Non-Pulsar Pulsar
## Non-Pulsar 968 5
## Pulsar 24 58
##
## Accuracy : 0.9725
## 95% CI : (0.9608, 0.9815)
## No Information Rate : 0.9403
## P-Value [Acc > NIR] : 7.519e-07
##
## Kappa : 0.7855
##
## Mcnemar's Test P-Value : 0.0008302
##
## Sensitivity : 0.92063
## Specificity : 0.97581
## Pos Pred Value : 0.70732
## Neg Pred Value : 0.99486
## Prevalence : 0.05972
## Detection Rate : 0.05498
## Detection Prevalence : 0.07773
## Balanced Accuracy : 0.94822
##
## 'Positive' Class : Pulsar
##
# Add LR Prediction Probabilities to FGL4_results data frame:
FGL4_results$BLR_P_LP <- round(predict(Model, newdata = FGL4_tidy, type = "probability")[, 2], digits = 3)
# Add LR Prediction category to FGL4_results data frame:
FGL4_results$BLR_Pred_LP <- ifelse(FGL4_results$BLR_P_LP > Best_threshold_train[1],
"PSR", "AGN")
# Clean up and Set aside required data sets
Environ <- ls()
Environ <- Environ[Environ != "FGL4_tidy"
& Environ != "train_idx"
& Environ != "FGL4_AGNPSR"
& Environ != "FGL4_results"
& Environ != "FGL4_test"
& Environ != "FGL4_train"
& Environ != "FGL4_LP_train"
& Environ != "FGL4_LP_test"
& Environ != "FGL4_PLEC_train"
& Environ != "FGL4_PLEC_test"
& Environ != "smote_PLEC_train"
& Environ != "smote_LP_train"
& Environ != "predictions_FGL4_train_CV" & Environ != "Block_index"
& Environ != "FGL4_train_CV"
& Environ != "FGL4_LP_train_CV"
& Environ != "FGL4_PLEC_train_CV"
& Environ != "ROC_threshold_plots_tables" & Environ != "FGL4_Pulsars_train"
& Environ != "FGL4_Pulsars_test"]
rm(list = Environ)
# Save current workspace for subsequent steps
save.image()
Step 4. AGN vs PSR classification using Random Forests (RF)
# AGN vs PSR classification using Random Forests (RF) # Load workspace from previous step
# Load randomForest package
library(randomForest)
set.seed(2)
##### ##### ##### ##### ##### ##### ##### ##### #####
# Choose the group of variable using LP spectrum type
rf.full.FGL4.LP <- randomForest(pulsarness~., data = smote_LP_train, importance = TRUE)
# The importance of each variable
importance(rf.full.FGL4.LP)
## Non-Pulsar Pulsar MeanDecreaseAccuracy MeanDecreaseGini
## LP_Index 17.803824 24.27911 26.29255 31.77668
## Variability_Index 17.735934 20.67549 25.75509 129.89654
## Frac_Variability 24.238532 33.53784 40.29818 280.52349
## Pivot_Energy 17.970767 20.48269 25.24394 38.12102
## LP_Flux_Density 12.060297 12.63981 16.24734 89.73547
## Unc_PL_Flux_Density 9.453351 12.70249 15.21665 30.09057
## LP_beta 27.699408 42.70506 47.63881 455.63003
## LP_SigCurv 26.539370 28.33617 35.53527 541.69987
## Npred 21.090354 21.85144 26.98389 171.66766
## Unc_Energy_Flux100 37.958736 30.52581 47.13364 134.95325
## hr12 15.714645 15.91438 19.58856 24.46861
## hr23 13.256833 15.85425 18.52541 25.03761
## hr34 14.995749 13.47151 16.71457 20.11207
## hr45 17.280349 25.57382 26.88047 204.35840
## hr56 14.120659 17.52732 20.03283 15.08260
## Signif 21.734273 21.41250 25.81852 115.14791
varImpPlot(rf.full.FGL4.LP)
# Prediction for train and test
predictions_FGL4_LP_train <- predict(rf.full.FGL4.LP, newdata = smote_LP_train, type = "Prob")[, 2]
predictions_FGL4_LP_test <- predict(rf.full.FGL4.LP, newdata = FGL4_LP_test, type = "Prob")[,2 ]
# Generate ROC plots, and print contingency tables
Best_threshold_train <- ROC_threshold_plots_tables(smote_LP_train$pulsarness,
predictions_FGL4_LP_train,
FGL4_test$pulsarness,
predictions_FGL4_LP_test)
## real_category
## Predict_class_train Non-Pulsar Pulsar
## Non-Pulsar 2220 0
## Pulsar 0 2405
## real_category
## Predict_class_test Non-Pulsar Pulsar
## Non-Pulsar 986 9
## Pulsar 6 54
cat("Best threshold from RF:", Best_threshold_train)
## Best threshold from RF: 0.558 1 1
# Get the confusion matrix and statistics
predictions_FGL4_LP_test <- ifelse (predictions_FGL4_LP_test > Best_threshold_train[1], "Pulsar", "Non-Pulsar")
cm_RF_LP<- confusionMatrix(factor(predictions_FGL4_LP_test), factor(FGL4_PLEC_test$pulsarness), positive="Pulsar")
print(cm_RF_LP)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Non-Pulsar Pulsar
## Non-Pulsar 986 9
## Pulsar 6 54
##
## Accuracy : 0.9858
## 95% CI : (0.9767, 0.992)
## No Information Rate : 0.9403
## P-Value [Acc > NIR] : 1.346e-13
##
## Kappa : 0.8705
##
## Mcnemar's Test P-Value : 0.6056
##
## Sensitivity : 0.85714
## Specificity : 0.99395
## Pos Pred Value : 0.90000
## Neg Pred Value : 0.99095
## Prevalence : 0.05972
## Detection Rate : 0.05118
## Detection Prevalence : 0.05687
## Balanced Accuracy : 0.92555
##
## 'Positive' Class : Pulsar
##
# Add RF Prediction probabilities to FGL4_results
FGL4_results$RF_P_LP <- predict(rf.full.FGL4.LP, newdata = FGL4_tidy, type = "Prob")[,2 ]
# Add RF Prediction category to FGL4_results
FGL4_results$RF_Pred_LP <- ifelse(FGL4_results$RF_P_LP > Best_threshold_train[1],
"PSR", "AGN")
#par(mfrow = c(1,1))
#varImpPlot(rf.full.FGL4.LP, pch = 19, type = 1, main = "")
##### ##### ##### ##### ##### ##### ##### ##### #####
# Choose the group of variable using PLEC spectrum type
# Now Modeling using all the data
rf.full.FGL4.PLEC <- randomForest(pulsarness~., data = smote_PLEC_train, importance = TRUE)
# The importance of each variable
importance(rf.full.FGL4.PLEC)
## Non-Pulsar Pulsar MeanDecreaseAccuracy MeanDecreaseGini
## PLEC_Index 18.944722 19.125709 23.131858 168.0759139
## Variability_Index 18.821615 20.099989 25.781404 114.0761693
## Frac_Variability 26.748593 33.792278 39.816004 281.0334698
## Pivot_Energy 19.933050 21.047393 27.770332 40.2494910
## PLEC_Flux_Density 12.403828 12.086922 15.073942 80.3798862
## Unc_PL_Flux_Density 10.079361 14.238939 16.060753 28.7315378
## PLEC_Expfactor 24.672511 30.671496 35.778698 397.5668142
## PLEC_Exp_Index 2.667066 2.722971 3.146663 0.6004715
## PLEC_SigCurv 24.310805 27.243626 33.856888 512.0953220
## Npred 23.263048 22.711006 28.899681 176.1675843
## Unc_Energy_Flux100 41.585116 37.022316 49.727464 139.0983600
## hr12 16.262667 16.649248 21.618039 22.5444622
## hr23 12.914681 16.077467 18.384346 24.9949436
## hr34 16.585442 14.704431 18.346679 22.6165609
## hr45 19.967886 23.112523 24.732052 190.3019048
## hr56 14.805873 17.254664 20.208062 15.5765123
## Signif 23.849538 20.910192 26.168898 93.9285501
varImpPlot(rf.full.FGL4.PLEC)
# Prediction for train and test
predictions_FGL4_PLEC_train <- predict(rf.full.FGL4.PLEC, newdata = smote_PLEC_train, type = "Prob")[, 2]
predictions_FGL4_PLEC_test <- predict(rf.full.FGL4.PLEC, newdata = FGL4_PLEC_test, type = "Prob")[,2 ]
# Generate ROC plots, and print contingency tables
Best_threshold_train <- ROC_threshold_plots_tables(smote_PLEC_train$pulsarness,
predictions_FGL4_LP_train,
FGL4_test$pulsarness,
predictions_FGL4_PLEC_test)
## real_category
## Predict_class_train Non-Pulsar Pulsar
## Non-Pulsar 2220 0
## Pulsar 0 2405
## real_category
## Predict_class_test Non-Pulsar Pulsar
## Non-Pulsar 985 9
## Pulsar 7 54
cat("Best threshold from RF:", Best_threshold_train)
## Best threshold from RF: 0.558 1 1
# Get the confusion matrix and statistics
predictions_FGL4_PLEC_test <- ifelse (predictions_FGL4_PLEC_test > Best_threshold_train[1], "Pulsar", "Non-Pulsar")
cm_RF_PLEC<- confusionMatrix(factor(predictions_FGL4_LP_test), factor(FGL4_PLEC_test$pulsarness), positive="Pulsar")
print(cm_RF_PLEC)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Non-Pulsar Pulsar
## Non-Pulsar 986 9
## Pulsar 6 54
##
## Accuracy : 0.9858
## 95% CI : (0.9767, 0.992)
## No Information Rate : 0.9403
## P-Value [Acc > NIR] : 1.346e-13
##
## Kappa : 0.8705
##
## Mcnemar's Test P-Value : 0.6056
##
## Sensitivity : 0.85714
## Specificity : 0.99395
## Pos Pred Value : 0.90000
## Neg Pred Value : 0.99095
## Prevalence : 0.05972
## Detection Rate : 0.05118
## Detection Prevalence : 0.05687
## Balanced Accuracy : 0.92555
##
## 'Positive' Class : Pulsar
##
# Add RF Prediction probabilities to FGL4_results
FGL4_results$RF_P_LP <- predict(rf.full.FGL4.LP, newdata = FGL4_tidy, type = "Prob")[,2 ]
# Add RF Prediction category to FGL4_results
FGL4_results$RF_Pred_LP <- ifelse(FGL4_results$RF_P_LP > Best_threshold_train[1],
"PSR", "AGN")
#par(mfrow = c(1,1))
#varImpPlot(rf.full.FGL4.PLEC, pch = 19, type = 1, main = "")
Step 5. Outlyingness
# Computation of AGN and PSR "outlyingness" using Random Forests (RF)
# Load workspace from previous step
# load(".RData")
# Load randomForest package
library(randomForest)
# Set random seed
set.seed(1)
# Compute RF predictions on entire FGL4 data
results <- predict(rf.full.FGL4.PLEC, newdata = FGL4_tidy, proximity = TRUE, type = "Prob")
# Use proximity matrix to compute PSR and AGN "outlyingness"
nobs <- dim(FGL4_tidy)[1]
PSR_Out <- numeric(nobs)
AGN_Out <- numeric(nobs)
for (i in 1:nobs ) {
PSR_proximities <- results$proximity[FGL4_tidy$pulsarness == "Pulsar", i]
PSR_Out[i] <- 1./sum(PSR_proximities^2)
AGN_proximities <- results$proximity[FGL4_tidy$agnness == "AGN", i]
AGN_Out[i] <- 1./sum(AGN_proximities^2)
}
# Normalize outlyingness
PSR_Out_norm <- (PSR_Out - median(PSR_Out[FGL4_tidy$pulsarness == "Pulsar"])) /
mad(PSR_Out[FGL4_tidy$pulsarness=="Pulsar"])
AGN_Out_norm <- (AGN_Out - median(AGN_Out[FGL4_tidy$agnness == "AGN"])) /
mad(AGN_Out[FGL4_tidy$agnness=="AGN"])
# Add outlyingness to FGL3_results data frame
FGL4_results$PSR_Out <- round(PSR_Out_norm, digits = 3)
FGL4_results$AGN_Out <- round(AGN_Out_norm, digits = 3)
plot(FGL4_results$PSR_Out, FGL4_results$AGN_Out,
xlim = c(0.1,900), ylim = c(0.1,3000),
xlab = "PSR Outlyingness",
ylab = "AGN Outlyingness",
log = "xy",
type = "n")
legend(0.1, 6, c("AGN", "Pulsars", "Non-AGN/Non-Pulsars"), pch = c(17, 19, 1))
points(FGL4_results$PSR_Out[FGL4_tidy$agnness=="AGN"],
FGL4_results$AGN_Out[FGL4_tidy$agnness=="AGN"],
pch = 17)
points(FGL4_results$PSR_Out[FGL4_tidy$pulsarness=="Pulsar"],
FGL4_results$AGN_Out[FGL4_tidy$pulsarness=="Pulsar"],
pch = 19)
points(FGL4_results$PSR_Out[(FGL4_tidy$agnness!="AGN")&(FGL4_tidy$pulsarness!="Pulsar")],
pch = 1)
abline(h = 10, lty = 2, lwd = 3)
abline(v = 10, lty = 2, lwd = 3)
# Clean up and Set aside required data sets are not shown
Step 6. AGN vs PSR classification using XGBoost
# AGN vs PSR classification using XGBoost
# Load workspace from previous step
# load(".RData")
# Load XGBoost package
library(xgboost)
library(caret)
# require(e1071) # require downloading for the first time
# require(Ckmeans.1d.dp) # require downloading for the first time
pulsarness_labels <- select(FGL4_AGNPSR, pulsarness)
#convert labels to pulsar = 1, non-pulsar = 0
pulsarness_labels$pulsarness <- as.numeric(pulsarness_labels$pulsarness=="Pulsar")
FGL4_AGNPSR_test_labels <- as.matrix(pulsarness_labels[-train_idx,])
##### ##### ##### ##### ##### ##### ##### ##### #####
# select the group of variable using PLEC spectrum type
FGL4_AGNPSR_PLEC <- select(FGL4_AGNPSR, -starts_with("PL_"), - starts_with("LP_"), -pulsarness)
FGL4_AGNPSR_PLEC_train <- as.matrix(FGL4_AGNPSR_PLEC[train_idx, ])
FGL4_AGNPSR_PLEC_test <- as.matrix(FGL4_AGNPSR_PLEC[-train_idx, ])
# put testing & training data into two seperates Dmatrixs objects
dtest_PLEC <- xgb.DMatrix(data = FGL4_AGNPSR_PLEC_test, label= FGL4_AGNPSR_test_labels)
##### ##### ##### ##### ##### ##### ##### ##### #####
# select the group of variable using LP spectrum type
FGL4_AGNPSR_LP <- select(FGL4_AGNPSR, -starts_with("PLEC_"), - starts_with("PL_"), -pulsarness)
FGL4_AGNPSR_LP_test <- as.matrix(FGL4_AGNPSR_LP[-train_idx, ])
FGL4_AGNPSR_LP_train <- as.matrix(FGL4_AGNPSR_LP[train_idx, ])
# put testing & training data into two seperates Dmatrixs objects
dtest_LP <- xgb.DMatrix(data = FGL4_AGNPSR_LP_test, label= FGL4_AGNPSR_test_labels)
##### ##### ##### ##### ##### ##### ##### ##### #####
# Use SMOTE
smote_AGNPSR_train_labels <- as.matrix(as.numeric(smote_PLEC_train$pulsarness=="Pulsar"))
agn_cases <- sum(smote_AGNPSR_train_labels == 0)
psr_cases <- sum(smote_AGNPSR_train_labels == 1)
smote_AGNPSR_PLEC_train <- as.matrix(select(smote_PLEC_train, -pulsarness))
dtrain_PLEC <- xgb.DMatrix(data = smote_AGNPSR_PLEC_train, label= smote_AGNPSR_train_labels)
smote_AGNPSR_LP_train <- as.matrix(select(smote_LP_train, -pulsarness))
dtrain_LP <- xgb.DMatrix(data = smote_AGNPSR_LP_train, label= smote_AGNPSR_train_labels)
########### Hyper-Parameter Tuning ##########
# default parameters to find best no. of iteration
params <- list(booster = "gbtree",
objective = "binary:logistic",
eval_metric = "auc", # "error",
eta=0.1, max_depth=6, min_child_weight=1, gamma=1, lambda = 0,
# scale_pos_weight = agn_cases / psr_cases, # control for imbalanced classes
maximize = F ,
subsample=1,
colsample_bytree=1
)
# Using the inbuilt xgb.cv function to calculate the best nround for this model.
# In addition, this function also returns CV error, which is an estimate of test error.
xgbcv_PLEC <- xgb.cv( params = params, data = dtrain_PLEC,
nrounds = 100, nfold = 10, showsd = T,
print_every_n = 20 , stratified = T, early_stopping_rounds = 20,
)
## [1] train-auc:0.997505+0.000488 test-auc:0.990877+0.005510
## Multiple eval metrics are present. Will use test_auc for early stopping.
## Will train until test_auc hasn't improved in 20 rounds.
##
## [21] train-auc:0.999646+0.000204 test-auc:0.997716+0.001608
## [41] train-auc:0.999984+0.000018 test-auc:0.998640+0.001147
## [61] train-auc:0.999999+0.000002 test-auc:0.999278+0.000396
## [81] train-auc:1.000000+0.000000 test-auc:0.999328+0.000397
## Stopping. Best iteration:
## [72] train-auc:1.000000+0.000000 test-auc:0.999336+0.000394
best_iter <- 100
if (!is.null(xgbcv_PLEC$best_iteration )) {best_iter <- xgbcv_PLEC$best_iteration}
print(paste("best_iteration PLEC:", best_iter))
## [1] "best_iteration PLEC: 72"
# Gradient boost model training
xgb1_PLEC <- xgb.train (params = params,
data = dtrain_PLEC,
nrounds = best_iter,
# early_stopping_rounds = 20,
print_every_n = 20 ,
watchlist = list(val=dtest_PLEC, train=dtrain_PLEC)
)
## [1] val-auc:0.958957 train-auc:0.997208
## [21] val-auc:0.990751 train-auc:0.999627
## [41] val-auc:0.990591 train-auc:0.999993
## [61] val-auc:0.990575 train-auc:1.000000
## [72] val-auc:0.991263 train-auc:1.000000
xgb.save(xgb1_PLEC,'xgb_PLEC.model')
## [1] TRUE
# xgb1_PLEC <- xgb.load('xgb_PLEC.model')
# Using package 'mlr to search for best parameter set
#convert characters to factors
library(mlr)
#create tasks
traintask <- makeClassifTask (data = FGL4_PLEC_train, target = "pulsarness")
testtask <- makeClassifTask (data = FGL4_PLEC_test, target = "pulsarness")
#create learner
lrn <- makeLearner("classif.xgboost", predict.type = "response")
lrn$par.vals <- list( objective="binary:logistic", eval_metric="auc",
nrounds=best_iter, # early_stopping_rounds = 40,
# scale_pos_weight = agn_cases / psr_cases,
print_every_n = 20)
#set parameter space
params_set <- makeParamSet( makeDiscreteParam("booster",values = c("gbtree")),
makeIntegerParam("max_depth", lower = 3L, upper = 10L),
makeNumericParam("min_child_weight", lower = 1L, upper = 10L),
makeNumericParam("subsample", lower = 0.7, upper = 1),
makeNumericParam("colsample_bytree", lower = 0.5, upper = 1),
makeNumericParam("lambda", lower=0.75, upper=1.25),
makeNumericParam("gamma", lower=0, upper=1.5),
makeNumericParam("eta", lower = 0.01, upper=0.3))
#set resampling strategy
rdesc <- makeResampleDesc("CV", stratify = T, iters=10L)
#search strategy
ctrl <- makeTuneControlRandom(maxit = 10L)
# Set a parallel backend to ensure faster computation.
# Make sure not to open several applications in backend.
library(parallel)
library(parallelMap)
#parallelStartSocket(cpus = 4) # detectCores()) # does not work in R 4.x
#parallelStop()
#parameter tuning
mytune <- tuneParams(learner = lrn, task = traintask,
resampling = rdesc,
measures = acc, par.set = params_set,
control = ctrl, show.info = T)
# Save a single object to a file
saveRDS(mytune, "mytune.rds")
# set hyperparameters
# lrn_tune <- setHyperPars(lrn, par.vals = mytune$x)
# train model
# xgmodel <- train(learner = lrn_tune, task = traintask)
# predict model
# xgpred <- predict(xgmodel, testtask)
# Restore mytune
#mytune <- readRDS("mytune.rds")
#mytune$x
set.seed(3)
params <- list(booster = "gbtree",
objective = "binary:logistic",
eval_metric = "auc",
eta = 0.05,
max_depth = 6,
min_child_weight = 8,
lambda = 1.1,
gamma = 1, # add a regularization term
# scale_pos_weight = agn_cases / psr_cases, # control for imbalanced classes
subsample = 0.7,
colsample_bytree = 0.9)
# Gradient boost model training
xgb1_PLEC <- xgb.train (params = params,
data = dtrain_PLEC,
nrounds = 500,
watchlist = list(val=dtest_PLEC, train=dtrain_PLEC),
print_every_n = 10,
early_stopping_rounds = 40,
maximize = TRUE
)
xgb.save(xgb1_PLEC,'xgb_PLEC.model')
xgb1_PLEC<- xgb.load('xgb_PLEC.model')
# model prediction
xgbpred_PLEC_test <- predict (xgb1_PLEC, dtest_PLEC)
xgbpred_PLEC_train <- predict (xgb1_PLEC, dtrain_PLEC)
# xgbpred1 <- ifelse (xgbpred1 > 0.5,1,0)
# err <- mean(as.numeric(xgbpred1 > 0.5) != FGL4_AGNPSR_test_labels)
# print(paste("model 1 test-error=", err))
# Get Training Set threshold, generate ROC plots, and print contingency tables
Best_threshold_train_PLEC <- ROC_threshold_plots_tables(smote_AGNPSR_train_labels, #FGL4_AGNPSR_train_labels,
xgbpred_PLEC_train,
FGL4_AGNPSR_test_labels,
xgbpred_PLEC_test, pos="1", neg = "0")
## real_category
## Predict_class_train 0 1
## 0 2220 0
## 1 0 2405
## real_category
## Predict_class_test 0 1
## 0 984 8
## 1 8 55
#Predict_class_test 0 1
# 0 982 6
# 1 10 57
cat("Best threshold from xgBoost:", Best_threshold_train_PLEC)
## Best threshold from xgBoost: 0.5802544 1 1
# Get the confusion matrix and statistics
xgbpred_PLEC_test <- ifelse (xgbpred_PLEC_test > Best_threshold_train_PLEC[1],1,0)
cm_xgb_PLEC<- confusionMatrix(factor(xgbpred_PLEC_test), factor(FGL4_AGNPSR_test_labels), positive="1")
print(cm_xgb_PLEC)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 984 8
## 1 8 55
##
## Accuracy : 0.9848
## 95% CI : (0.9755, 0.9913)
## No Information Rate : 0.9403
## P-Value [Acc > NIR] : 5.666e-13
##
## Kappa : 0.865
##
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 0.87302
## Specificity : 0.99194
## Pos Pred Value : 0.87302
## Neg Pred Value : 0.99194
## Prevalence : 0.05972
## Detection Rate : 0.05213
## Detection Prevalence : 0.05972
## Balanced Accuracy : 0.93248
##
## 'Positive' Class : 1
##
# Add XGBoost Prediction probabilities to FGL4_results
FGL4_tidy_num <- select(FGL4_tidy, -CLASS1, -ASSOC1, -ASSOC2, -CLASS2,
-SpectrumType, -Source_Name, -ASSOC_FGL, -GLAT, -Conf_95_PosAng, -agnness)
FGL4_tidy_num_PLEC <- select(FGL4_tidy_num, -starts_with("PL_"), - starts_with("LP_"), -pulsarness)
FGL4_tidy_num_PLEC_labels <- as.matrix(rep(0,nrow(FGL4_tidy_num_PLEC)))
dFGL4_tidy_PLEC <- xgb.DMatrix(data = as.matrix(as.data.frame(FGL4_tidy_num_PLEC)),
label= FGL4_tidy_num_PLEC_labels)
FGL4_results$XGB_P_PLEC <- predict(xgb1_PLEC, newdata = dFGL4_tidy_PLEC, type = "Prob")
FGL4_results$XGB_Pred_PLEC <- ifelse (FGL4_results$XGB_P_PLEC > Best_threshold_train_PLEC[1],c("PSR"), c("AGN"))
# Plot
par(mfrow = c(1,1))
#view variable importance plot
mat <- xgb.importance (feature_names = names(FGL4_AGNPSR_PLEC), model = xgb1_PLEC)
xgb.ggplot.importance(importance_matrix = mat)
xgb1_LP<- xgb.load('xgb_LP.model')
# model prediction
xgbpred_LP_test <- predict (xgb1_LP, dtest_LP)
xgbpred_LP_train <- predict (xgb1_LP, dtrain_LP)
# xgbpred1 <- ifelse (xgbpred1 > 0.5,1,0)
# err <- mean(as.numeric(xgbpred1 > 0.5) != FGL4_AGNPSR_test_labels)
# print(paste("model 1 test-error=", err))
# Get Training Set threshold, generate ROC plots, and print contingency tables
Best_threshold_train_LP <- ROC_threshold_plots_tables(smote_AGNPSR_train_labels, #FGL4_AGNPSR_train_labels,
xgbpred_LP_train,
FGL4_AGNPSR_test_labels,
xgbpred_LP_test, pos="1", neg = "0")
## real_category
## Predict_class_train 0 1
## 0 2214 3
## 1 6 2402
## real_category
## Predict_class_test 0 1
## 0 981 6
## 1 11 57
cat("Best threshold from xgBoost for LP:", Best_threshold_train_LP)
## Best threshold from xgBoost for LP: 0.578409 0.9987526 0.9972973
# Get the confusion matrix and statistics
xgbpred_LP_test <- ifelse (xgbpred_LP_test > Best_threshold_train_LP[1],1,0)
cm_xgb_LP<- confusionMatrix(factor(xgbpred_LP_test), factor(FGL4_AGNPSR_test_labels), positive="1")
print(cm_xgb_LP)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 981 6
## 1 11 57
##
## Accuracy : 0.9839
## 95% CI : (0.9743, 0.9906)
## No Information Rate : 0.9403
## P-Value [Acc > NIR] : 2.244e-12
##
## Kappa : 0.8617
##
## Mcnemar's Test P-Value : 0.332
##
## Sensitivity : 0.90476
## Specificity : 0.98891
## Pos Pred Value : 0.83824
## Neg Pred Value : 0.99392
## Prevalence : 0.05972
## Detection Rate : 0.05403
## Detection Prevalence : 0.06445
## Balanced Accuracy : 0.94684
##
## 'Positive' Class : 1
##
# Add XGBoost Prediction probabilities to FGL4_results
FGL4_tidy_num_LP <- select(FGL4_tidy_num, -starts_with("PL_"), - starts_with("PLEC_"), -pulsarness)
FGL4_tidy_num_LP_labels <- as.matrix(rep(0,nrow(FGL4_tidy_num_LP)))
dFGL4_tidy_LP <- xgb.DMatrix(data = as.matrix(as.data.frame(FGL4_tidy_num_LP)),
label= FGL4_tidy_num_LP_labels)
FGL4_results$XGB_P_LP <- predict(xgb1_PLEC, newdata = dFGL4_tidy_LP, type = "Prob")
FGL4_results$XGB_Pred_LP <- ifelse (FGL4_results$XGB_P_LP > Best_threshold_train_LP[1],c("PSR"), c("AGN"))
# Plot
par(mfrow = c(1,1))
#view variable importance plot
mat <- xgb.importance (feature_names = names(FGL4_AGNPSR_LP), model = xgb1_LP)
xgb.ggplot.importance(importance_matrix = mat)
save(FGL4_results, file = "FGL4_results.rData", compress = "xz")