Classification On 4fgl (part1)

Classification on 4FGL (Part1)

Source

“FGL4.rData” are downloaded from Fermi LAT data products (https://fermi.gsfc.nasa.gov/ssc/data/access/).

Reference publication:

“Classification and Ranking of Fermi LAT Gamma-ray Sources from the 3FGL Catalog Using Machine Learning Techniques”, Saz Parkinson, P. M. (HKU/LSR, SCIPP), Xu, H. (HKU), Yu, P. L. H. (HKU), Salvetti, D. (INAF-Milan), Marelli, M. (INAF-Milan), and Falcone, A. D. (Penn State), The Astrophysical Journal, 2016. (http://arxiv.org/abs/1602.00385)

Difference on feature selection between 4FGL & 3FGL results

4FGL field	3FGL field	Note
PL_Flux_Density	Flux_Density	simple power-law form
LP_Flux_Density		LogParabola form, for all significantly curved spectra except pulsars
PLEC_Flux_Density		PLSuperExpCutoff form, a subexponentially cutoff power law for all significantly curved pulsars
LP_SigCurv	Signif_Curve	Significance (in σ units) of the fit improvement between PowerLaw and PLSuperExpCutoff. A value greater than 4 indicates significant curvature
PLEC SigCurv
Flux_Band.2	Flux100_300
Flux_Band.3	Flux300_1000
Flux_Band.4	Flux1000_3000
Flux_Band.5	Flux3000_10000
Flux_Band.6	Flux10000_100000	replace 10-100 GeV with 10 - 30GeV
Flux_Band.7	Flux10000_100000	replace 10-100 GeV with 30 - 300 GeV

Step 1. Load fermicatsR package with added 4th Fermi LAT Catalogs; Split data

library(devtools)
load_all('fermicatsR') # load package w/o installing
library(fermicatsR)
library(dplyr)
library(caret)
library(DMwR) 
load("fermicatsR/FGL4_results.rData")
load("fermicatsR/FGL4_tidy.rData")

# Set Random seed
set.seed(2)

# Select data for PSR (pulsarness == "Pulsar") vs AGN (agnness == "AGN") classification
FGL4_AGNPSR <- filter(FGL4_tidy, pulsarness == "Pulsar" | agnness == "AGN")

# Drop sources with missing values and some unused variables
FGL4_AGNPSR <- na.omit(FGL4_AGNPSR)
FGL4_AGNPSR <- select(FGL4_AGNPSR, -CLASS1, -ASSOC1, -ASSOC2, -CLASS2, 
                      -SpectrumType, -Source_Name, -ASSOC_FGL, -GLAT, -Conf_95_PosAng, -agnness)

# Separate into training (70%) and testing (30%) sets
train_idx <- sample(nrow(FGL4_AGNPSR), round(nrow(FGL4_AGNPSR)*0.7)) 
FGL4_test <- FGL4_AGNPSR[-train_idx, ]
FGL4_train <- FGL4_AGNPSR[train_idx, ]
table(FGL4_train$pulsarness)

## 
## Non-Pulsar     Pulsar 
##       2276        185

### SMOTE
# Very imbalanced dataset, so let's see if using smote can improve this model.
i <- grep("pulsarness", colnames(FGL4_train)) # Get index Class column
train_smote <- SMOTE(pulsarness ~ ., as.data.frame(FGL4_train), 
                     perc.over = 1200, perc.under=100, #learner='rpartXse',se=0.5
                     )
table(train_smote$pulsarness)

## 
## Non-Pulsar     Pulsar 
##       2220       2405

# table(train_smote$pulsarness) 

# Split training data into k (10) blocks for k-fold cross-validation
k <- 10
Index <- sample(nrow(FGL4_train))
Block_index <- matrix(data = Index[1:(floor(nrow(FGL4_train)/k)*k)], nrow = k, byrow = TRUE) 
FGL4_train_CV <- FGL4_train[Index[1:(floor(nrow(FGL4_train)/k)*k)], ]
# Initialise predictions vector
predictions_FGL4_train_CV <- rep(0, nrow(FGL4_train_CV))

# select the group of variable using LP spectrum type
FGL4_LP_train <-  select(FGL4_train, -starts_with("PLEC_"), - starts_with("PL_"))
FGL4_LP_test <- select(FGL4_test, -starts_with("PLEC_"), - starts_with("PL_"))
FGL4_LP_train_CV <- select(FGL4_train_CV, -starts_with("PLEC_"), - starts_with("PL_"))
smote_LP_train <-  select(train_smote, -starts_with("PLEC_"), - starts_with("PL_"))

# select the group of variable using PLEC spectrum type
FGL4_PLEC_train <-  select(FGL4_train, -starts_with("PL_"), - starts_with("LP_"))
FGL4_PLEC_test <- select(FGL4_test, -starts_with("PL_"), - starts_with("LP_"))
FGL4_PLEC_train_CV <- select(FGL4_train_CV, -starts_with("PL_"), - starts_with("LP_"))
smote_PLEC_train <-  select(train_smote, -starts_with("PL_"), - starts_with("LP_"))

Step 2. Function to calculate AUC as an evaluation metric, get best threshold and plot ROC curves

# Load workspace from previous step
load(".RData")
library(keras)
#install_keras()
library(pROC)

kerasAUC <- function(y_true, y_pred){
    true= k_flatten(y_true)
    pred = k_flatten(y_pred)
  
    #total number of elements in this batch
    totalCount = k_shape(true)[1]
  
    #sorting the prediction values in descending order
    values = tensorflow::tf$nn$top_k(pred,k=totalCount)
    indices<-values[[1]]
    values<-values[[0]]
  
    #sorting the ground truth values based on the predictions above         
    sortedTrue = k_gather(true, indices)
  
    #getting the ground negative elements (already sorted above)
    negatives = 1 - sortedTrue
  
    #the true positive count per threshold
    TPCurve = k_cumsum(sortedTrue)
  
    #area under the curve
    auc = k_sum(TPCurve * negatives)
  
    #normalizing the result between 0 and 1
    totalCount = k_cast(totalCount, k_floatx())
    positiveCount = k_sum(true)
    negativeCount = totalCount - positiveCount
    totalArea = positiveCount * negativeCount
    return  (auc / totalArea)
}

ROC_threshold_plots_tables <- function(truth_train, prediction_train, 
                                       truth_test, prediction_test, 
                                       threshold = 0, pos = "Pulsar", neg = "Non-Pulsar", print_table=TRUE) {
        if (threshold == 0) {
                # Compute best threshold if none is provided
                ROC <- roc(truth_train, prediction_train, levels = c(pos,neg))
                ROC_table <- cbind(ROC$thresholds, ROC$specificities, ROC$sensitivities) 
                best_threshold <- ROC_table[which.max(ROC_table[, 2] + ROC_table[, 3]), ]
        } else {
                # Don't compute threshold if one is provided 
                best_threshold <- threshold
        }
        if (nargs() > 2) {
                # Make ROC plots
                par(mfrow = c(1, 2))
                ROC_train <- roc(truth_train, prediction_train, levels = c(pos,neg),
                        plot = TRUE, print.auc = TRUE, main = "ROC Train", print.thres = "best") 
                ROC_test <- roc(truth_test, prediction_test, levels = c(pos,neg),
                        plot = TRUE, print.auc = TRUE, main = "ROC Test", print.thres = "best")
                
                if (print_table) {
                        # Generate contingency tables
                        # Train data
                        nrow_train <- length(prediction_train)
                        Predict_class_train <- rep("NA", nrow_train)
                        Predict_class_train <- ifelse(prediction_train > best_threshold[1], pos, neg) 
                        real_category <- truth_train
                        print(table(Predict_class_train, real_category))
                        
                        # Test data
                        nrow_test <- length(prediction_test)
                        Predict_class_test <- rep("NA", nrow_test)
                        Predict_class_test <- ifelse(prediction_test > best_threshold[1], pos, neg) 
                        real_category <- truth_test
                        print(table(Predict_class_test, real_category))
                }
        } 
        return(best_threshold)
}

# Save current workspace for subsequent steps
save.image()

Step 3. AGN vs PSR using Boosted Logistic Regression

library(RWeka) 
library(party)

## Loading required package: mvtnorm

## Loading required package: modeltools

## Loading required package: stats4

## 
## Attaching package: 'modeltools'

## The following object is masked from 'package:keras':
## 
##     fit

## Loading required package: strucchange

## Loading required package: zoo

## 
## Attaching package: 'zoo'

## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric

## Loading required package: sandwich

##### ##### ##### ##### ##### ##### ##### ##### ##### 
# Choose the group of variable using PLEC spectrum type

Model <- LogitBoost(pulsarness ~., data = FGL4_PLEC_train) 
# summary(Model)

# Model predictions
predictions_FGL4_train <- predict(Model, newdata = FGL4_PLEC_train, type = "probability")[, 2]
predictions_FGL4_test <- predict(Model, newdata = FGL4_PLEC_test, type = "probability")[, 2]
# Generate ROC plots and print contingency tables
Best_threshold_train <- ROC_threshold_plots_tables(FGL4_PLEC_train$pulsarness, 
                                                   predictions_FGL4_train,
                                                   FGL4_PLEC_test$pulsarness,
                                                   predictions_FGL4_test)

## Setting direction: controls > cases

## Setting direction: controls > cases
## Setting direction: controls > cases

##                    real_category
## Predict_class_train Non-Pulsar Pulsar
##          Non-Pulsar       2178      7
##          Pulsar             98    178
##                   real_category
## Predict_class_test Non-Pulsar Pulsar
##         Non-Pulsar        950      4
##         Pulsar             42     59

FGL4_BLR_PLEC_test <- ifelse (predictions_FGL4_test > Best_threshold_train[1], 
                             "Pulsar", "Non-Pulsar")
cm_BLR_PLEC<- confusionMatrix(factor(FGL4_BLR_PLEC_test), 
                            factor(FGL4_PLEC_test$pulsarness), positive="Pulsar")
print(cm_BLR_PLEC)

## Confusion Matrix and Statistics
## 
##             Reference
## Prediction   Non-Pulsar Pulsar
##   Non-Pulsar        950      4
##   Pulsar             42     59
##                                           
##                Accuracy : 0.9564          
##                  95% CI : (0.9423, 0.9679)
##     No Information Rate : 0.9403          
##     P-Value [Acc > NIR] : 0.01312         
##                                           
##                   Kappa : 0.6972          
##                                           
##  Mcnemar's Test P-Value : 4.888e-08       
##                                           
##             Sensitivity : 0.93651         
##             Specificity : 0.95766         
##          Pos Pred Value : 0.58416         
##          Neg Pred Value : 0.99581         
##              Prevalence : 0.05972         
##          Detection Rate : 0.05592         
##    Detection Prevalence : 0.09573         
##       Balanced Accuracy : 0.94708         
##                                           
##        'Positive' Class : Pulsar          
##

# Add LR Prediction Probabilities to FGL4_results data frame: 
FGL4_results$BLR_P_PLEC <- round(predict(Model, newdata = FGL4_tidy, type = "probability")[, 2], digits = 3)
# Add LR Prediction category to FGL4_results data frame:
FGL4_results$BLR_Pred_PLEC <- ifelse(FGL4_results$BLR_P_PLEC > Best_threshold_train[1],
                               "PSR", "AGN")

##### ##### ##### ##### ##### ##### ##### ##### ##### 
# Choose the group of variable using LP spectrum type

Model <- LogitBoost(pulsarness ~., data = FGL4_LP_train) 
# summary(Model)

# Model predictions
predictions_FGL4_train <- predict(Model, newdata = FGL4_LP_train, type = "probability")[, 2]
predictions_FGL4_test <- predict(Model, newdata = FGL4_LP_test, type = "probability")[, 2]
# Generate ROC plots and print contingency tables
Best_threshold_train <- ROC_threshold_plots_tables(FGL4_LP_train$pulsarness, 
                                                   predictions_FGL4_train,
                                                   FGL4_LP_test$pulsarness,
                                                   predictions_FGL4_test)

## Setting direction: controls > cases
## Setting direction: controls > cases
## Setting direction: controls > cases

##                    real_category
## Predict_class_train Non-Pulsar Pulsar
##          Non-Pulsar       2215     10
##          Pulsar             61    175
##                   real_category
## Predict_class_test Non-Pulsar Pulsar
##         Non-Pulsar        968      5
##         Pulsar             24     58

FGL4_BLR_LP_test <- ifelse (predictions_FGL4_test > Best_threshold_train[1], 
                             "Pulsar", "Non-Pulsar")
cm_BLR_LP<- confusionMatrix(factor(FGL4_BLR_LP_test), 
                            factor(FGL4_LP_test$pulsarness), positive="Pulsar")
print(cm_BLR_LP)

## Confusion Matrix and Statistics
## 
##             Reference
## Prediction   Non-Pulsar Pulsar
##   Non-Pulsar        968      5
##   Pulsar             24     58
##                                           
##                Accuracy : 0.9725          
##                  95% CI : (0.9608, 0.9815)
##     No Information Rate : 0.9403          
##     P-Value [Acc > NIR] : 7.519e-07       
##                                           
##                   Kappa : 0.7855          
##                                           
##  Mcnemar's Test P-Value : 0.0008302       
##                                           
##             Sensitivity : 0.92063         
##             Specificity : 0.97581         
##          Pos Pred Value : 0.70732         
##          Neg Pred Value : 0.99486         
##              Prevalence : 0.05972         
##          Detection Rate : 0.05498         
##    Detection Prevalence : 0.07773         
##       Balanced Accuracy : 0.94822         
##                                           
##        'Positive' Class : Pulsar          
##

# Add LR Prediction Probabilities to FGL4_results data frame: 
FGL4_results$BLR_P_LP <- round(predict(Model, newdata = FGL4_tidy, type = "probability")[, 2], digits = 3)
# Add LR Prediction category to FGL4_results data frame:
FGL4_results$BLR_Pred_LP <- ifelse(FGL4_results$BLR_P_LP > Best_threshold_train[1],
                               "PSR", "AGN")

# Clean up and Set aside required data sets
Environ <- ls()
Environ <- Environ[Environ != "FGL4_tidy"
                   & Environ != "train_idx"
                   & Environ != "FGL4_AGNPSR"
                   & Environ != "FGL4_results"
                   & Environ != "FGL4_test"
                   & Environ != "FGL4_train"
                   & Environ != "FGL4_LP_train"
                   & Environ != "FGL4_LP_test"
                   & Environ != "FGL4_PLEC_train"
                   & Environ != "FGL4_PLEC_test"
                   & Environ != "smote_PLEC_train"
                   & Environ != "smote_LP_train"
                   & Environ != "predictions_FGL4_train_CV" & Environ != "Block_index"
                   & Environ != "FGL4_train_CV"
                   & Environ != "FGL4_LP_train_CV"
                   & Environ != "FGL4_PLEC_train_CV"
                   & Environ != "ROC_threshold_plots_tables" & Environ != "FGL4_Pulsars_train"
                   & Environ != "FGL4_Pulsars_test"]
rm(list = Environ)
# Save current workspace for subsequent steps
save.image()

Step 4. AGN vs PSR classification using Random Forests (RF)

# AGN vs PSR classification using Random Forests (RF) # Load workspace from previous step
# Load randomForest package
library(randomForest) 
set.seed(2)

##### ##### ##### ##### ##### ##### ##### ##### ##### 
# Choose the group of variable using LP spectrum type

rf.full.FGL4.LP <- randomForest(pulsarness~., data = smote_LP_train, importance = TRUE) 
# The importance of each variable
importance(rf.full.FGL4.LP)

##                     Non-Pulsar   Pulsar MeanDecreaseAccuracy MeanDecreaseGini
## LP_Index             17.803824 24.27911             26.29255         31.77668
## Variability_Index    17.735934 20.67549             25.75509        129.89654
## Frac_Variability     24.238532 33.53784             40.29818        280.52349
## Pivot_Energy         17.970767 20.48269             25.24394         38.12102
## LP_Flux_Density      12.060297 12.63981             16.24734         89.73547
## Unc_PL_Flux_Density   9.453351 12.70249             15.21665         30.09057
## LP_beta              27.699408 42.70506             47.63881        455.63003
## LP_SigCurv           26.539370 28.33617             35.53527        541.69987
## Npred                21.090354 21.85144             26.98389        171.66766
## Unc_Energy_Flux100   37.958736 30.52581             47.13364        134.95325
## hr12                 15.714645 15.91438             19.58856         24.46861
## hr23                 13.256833 15.85425             18.52541         25.03761
## hr34                 14.995749 13.47151             16.71457         20.11207
## hr45                 17.280349 25.57382             26.88047        204.35840
## hr56                 14.120659 17.52732             20.03283         15.08260
## Signif               21.734273 21.41250             25.81852        115.14791

varImpPlot(rf.full.FGL4.LP)

# Prediction for train and test
predictions_FGL4_LP_train <- predict(rf.full.FGL4.LP, newdata = smote_LP_train, type = "Prob")[, 2] 
predictions_FGL4_LP_test <- predict(rf.full.FGL4.LP, newdata = FGL4_LP_test, type = "Prob")[,2 ]

# Generate ROC plots, and print contingency tables
Best_threshold_train <- ROC_threshold_plots_tables(smote_LP_train$pulsarness,
                                                   predictions_FGL4_LP_train, 
                           FGL4_test$pulsarness, 
                           predictions_FGL4_LP_test)

##                    real_category
## Predict_class_train Non-Pulsar Pulsar
##          Non-Pulsar       2220      0
##          Pulsar              0   2405
##                   real_category
## Predict_class_test Non-Pulsar Pulsar
##         Non-Pulsar        986      9
##         Pulsar              6     54

cat("Best threshold from RF:", Best_threshold_train)

## Best threshold from RF: 0.558 1 1

# Get the confusion matrix and statistics
predictions_FGL4_LP_test <- ifelse (predictions_FGL4_LP_test > Best_threshold_train[1], "Pulsar", "Non-Pulsar")
cm_RF_LP<- confusionMatrix(factor(predictions_FGL4_LP_test), factor(FGL4_PLEC_test$pulsarness), positive="Pulsar")
print(cm_RF_LP)

## Confusion Matrix and Statistics
## 
##             Reference
## Prediction   Non-Pulsar Pulsar
##   Non-Pulsar        986      9
##   Pulsar              6     54
##                                          
##                Accuracy : 0.9858         
##                  95% CI : (0.9767, 0.992)
##     No Information Rate : 0.9403         
##     P-Value [Acc > NIR] : 1.346e-13      
##                                          
##                   Kappa : 0.8705         
##                                          
##  Mcnemar's Test P-Value : 0.6056         
##                                          
##             Sensitivity : 0.85714        
##             Specificity : 0.99395        
##          Pos Pred Value : 0.90000        
##          Neg Pred Value : 0.99095        
##              Prevalence : 0.05972        
##          Detection Rate : 0.05118        
##    Detection Prevalence : 0.05687        
##       Balanced Accuracy : 0.92555        
##                                          
##        'Positive' Class : Pulsar         
##

# Add RF Prediction probabilities to FGL4_results
FGL4_results$RF_P_LP <- predict(rf.full.FGL4.LP, newdata = FGL4_tidy, type = "Prob")[,2 ] 

# Add RF Prediction category to FGL4_results
FGL4_results$RF_Pred_LP <- ifelse(FGL4_results$RF_P_LP > Best_threshold_train[1],
                               "PSR", "AGN")

#par(mfrow = c(1,1))
#varImpPlot(rf.full.FGL4.LP, pch = 19, type = 1, main = "")

##### ##### ##### ##### ##### ##### ##### ##### ##### 
# Choose the group of variable using PLEC spectrum type
# Now Modeling using all the data
rf.full.FGL4.PLEC <- randomForest(pulsarness~., data = smote_PLEC_train, importance = TRUE) 
# The importance of each variable
importance(rf.full.FGL4.PLEC)

##                     Non-Pulsar    Pulsar MeanDecreaseAccuracy MeanDecreaseGini
## PLEC_Index           18.944722 19.125709            23.131858      168.0759139
## Variability_Index    18.821615 20.099989            25.781404      114.0761693
## Frac_Variability     26.748593 33.792278            39.816004      281.0334698
## Pivot_Energy         19.933050 21.047393            27.770332       40.2494910
## PLEC_Flux_Density    12.403828 12.086922            15.073942       80.3798862
## Unc_PL_Flux_Density  10.079361 14.238939            16.060753       28.7315378
## PLEC_Expfactor       24.672511 30.671496            35.778698      397.5668142
## PLEC_Exp_Index        2.667066  2.722971             3.146663        0.6004715
## PLEC_SigCurv         24.310805 27.243626            33.856888      512.0953220
## Npred                23.263048 22.711006            28.899681      176.1675843
## Unc_Energy_Flux100   41.585116 37.022316            49.727464      139.0983600
## hr12                 16.262667 16.649248            21.618039       22.5444622
## hr23                 12.914681 16.077467            18.384346       24.9949436
## hr34                 16.585442 14.704431            18.346679       22.6165609
## hr45                 19.967886 23.112523            24.732052      190.3019048
## hr56                 14.805873 17.254664            20.208062       15.5765123
## Signif               23.849538 20.910192            26.168898       93.9285501

varImpPlot(rf.full.FGL4.PLEC)

# Prediction for train and test
predictions_FGL4_PLEC_train <- predict(rf.full.FGL4.PLEC, newdata = smote_PLEC_train, type = "Prob")[, 2] 
predictions_FGL4_PLEC_test <- predict(rf.full.FGL4.PLEC, newdata = FGL4_PLEC_test, type = "Prob")[,2 ]

# Generate ROC plots, and print contingency tables
Best_threshold_train <- ROC_threshold_plots_tables(smote_PLEC_train$pulsarness,
                                                   predictions_FGL4_LP_train, 
                           FGL4_test$pulsarness, 
                           predictions_FGL4_PLEC_test)

##                    real_category
## Predict_class_train Non-Pulsar Pulsar
##          Non-Pulsar       2220      0
##          Pulsar              0   2405
##                   real_category
## Predict_class_test Non-Pulsar Pulsar
##         Non-Pulsar        985      9
##         Pulsar              7     54

cat("Best threshold from RF:", Best_threshold_train)

## Best threshold from RF: 0.558 1 1

# Get the confusion matrix and statistics
predictions_FGL4_PLEC_test <- ifelse (predictions_FGL4_PLEC_test > Best_threshold_train[1], "Pulsar", "Non-Pulsar")
cm_RF_PLEC<- confusionMatrix(factor(predictions_FGL4_LP_test), factor(FGL4_PLEC_test$pulsarness), positive="Pulsar")
print(cm_RF_PLEC)

## Confusion Matrix and Statistics
## 
##             Reference
## Prediction   Non-Pulsar Pulsar
##   Non-Pulsar        986      9
##   Pulsar              6     54
##                                          
##                Accuracy : 0.9858         
##                  95% CI : (0.9767, 0.992)
##     No Information Rate : 0.9403         
##     P-Value [Acc > NIR] : 1.346e-13      
##                                          
##                   Kappa : 0.8705         
##                                          
##  Mcnemar's Test P-Value : 0.6056         
##                                          
##             Sensitivity : 0.85714        
##             Specificity : 0.99395        
##          Pos Pred Value : 0.90000        
##          Neg Pred Value : 0.99095        
##              Prevalence : 0.05972        
##          Detection Rate : 0.05118        
##    Detection Prevalence : 0.05687        
##       Balanced Accuracy : 0.92555        
##                                          
##        'Positive' Class : Pulsar         
##

# Add RF Prediction probabilities to FGL4_results
FGL4_results$RF_P_LP <- predict(rf.full.FGL4.LP, newdata = FGL4_tidy, type = "Prob")[,2 ] 

# Add RF Prediction category to FGL4_results
FGL4_results$RF_Pred_LP <- ifelse(FGL4_results$RF_P_LP > Best_threshold_train[1],
                               "PSR", "AGN")

#par(mfrow = c(1,1))
#varImpPlot(rf.full.FGL4.PLEC, pch = 19, type = 1, main = "")

Step 5. Outlyingness

# Computation of AGN and PSR "outlyingness" using Random Forests (RF) 
# Load workspace from previous step
# load(".RData")
# Load randomForest package
library(randomForest) 

# Set random seed
set.seed(1)

# Compute RF predictions on entire FGL4 data
results <- predict(rf.full.FGL4.PLEC, newdata = FGL4_tidy, proximity = TRUE, type = "Prob")

# Use proximity matrix to compute PSR and AGN "outlyingness"
nobs <- dim(FGL4_tidy)[1] 
PSR_Out <- numeric(nobs) 
AGN_Out <- numeric(nobs)
for (i in 1:nobs ) {
        PSR_proximities <- results$proximity[FGL4_tidy$pulsarness == "Pulsar", i] 
        PSR_Out[i] <- 1./sum(PSR_proximities^2)
        AGN_proximities <- results$proximity[FGL4_tidy$agnness == "AGN", i] 
        AGN_Out[i] <- 1./sum(AGN_proximities^2)
}

# Normalize outlyingness
PSR_Out_norm <- (PSR_Out - median(PSR_Out[FGL4_tidy$pulsarness == "Pulsar"])) / 
        mad(PSR_Out[FGL4_tidy$pulsarness=="Pulsar"])
AGN_Out_norm <- (AGN_Out - median(AGN_Out[FGL4_tidy$agnness == "AGN"])) / 
        mad(AGN_Out[FGL4_tidy$agnness=="AGN"])

# Add outlyingness to FGL3_results data frame 
FGL4_results$PSR_Out <- round(PSR_Out_norm, digits = 3) 
FGL4_results$AGN_Out <- round(AGN_Out_norm, digits = 3)

plot(FGL4_results$PSR_Out, FGL4_results$AGN_Out, 
     xlim = c(0.1,900), ylim = c(0.1,3000), 
     xlab = "PSR Outlyingness", 
     ylab = "AGN Outlyingness", 
     log = "xy",
     type = "n")

legend(0.1, 6, c("AGN", "Pulsars", "Non-AGN/Non-Pulsars"), pch = c(17, 19, 1))
points(FGL4_results$PSR_Out[FGL4_tidy$agnness=="AGN"], 
       FGL4_results$AGN_Out[FGL4_tidy$agnness=="AGN"], 
       pch = 17)
points(FGL4_results$PSR_Out[FGL4_tidy$pulsarness=="Pulsar"], 
       FGL4_results$AGN_Out[FGL4_tidy$pulsarness=="Pulsar"], 
       pch = 19)
points(FGL4_results$PSR_Out[(FGL4_tidy$agnness!="AGN")&(FGL4_tidy$pulsarness!="Pulsar")], 
       pch = 1)
abline(h = 10, lty = 2, lwd = 3)
abline(v = 10, lty = 2, lwd = 3)

# Clean up and Set aside required data sets are not shown

Step 6. AGN vs PSR classification using XGBoost

# AGN vs PSR classification using XGBoost
# Load workspace from previous step
# load(".RData")
# Load XGBoost package
library(xgboost) 
library(caret)
# require(e1071) # require downloading for the first time
# require(Ckmeans.1d.dp) # require downloading for the first time

pulsarness_labels <- select(FGL4_AGNPSR, pulsarness)
#convert labels to pulsar = 1, non-pulsar = 0
pulsarness_labels$pulsarness <- as.numeric(pulsarness_labels$pulsarness=="Pulsar") 
FGL4_AGNPSR_test_labels <- as.matrix(pulsarness_labels[-train_idx,])

##### ##### ##### ##### ##### ##### ##### ##### ##### 
# select the group of variable using PLEC spectrum type
FGL4_AGNPSR_PLEC <-  select(FGL4_AGNPSR, -starts_with("PL_"), - starts_with("LP_"), -pulsarness)
FGL4_AGNPSR_PLEC_train <- as.matrix(FGL4_AGNPSR_PLEC[train_idx, ])
FGL4_AGNPSR_PLEC_test <- as.matrix(FGL4_AGNPSR_PLEC[-train_idx, ])
# put testing & training data into two seperates Dmatrixs objects
dtest_PLEC <- xgb.DMatrix(data = FGL4_AGNPSR_PLEC_test, label= FGL4_AGNPSR_test_labels)

##### ##### ##### ##### ##### ##### ##### ##### ##### 
# select the group of variable using LP spectrum type
FGL4_AGNPSR_LP <-  select(FGL4_AGNPSR, -starts_with("PLEC_"), - starts_with("PL_"), -pulsarness)
FGL4_AGNPSR_LP_test <- as.matrix(FGL4_AGNPSR_LP[-train_idx, ])
FGL4_AGNPSR_LP_train <- as.matrix(FGL4_AGNPSR_LP[train_idx, ])
# put testing & training data into two seperates Dmatrixs objects
dtest_LP <- xgb.DMatrix(data = FGL4_AGNPSR_LP_test, label= FGL4_AGNPSR_test_labels)

##### ##### ##### ##### ##### ##### ##### ##### ##### 
# Use SMOTE
smote_AGNPSR_train_labels <- as.matrix(as.numeric(smote_PLEC_train$pulsarness=="Pulsar"))
agn_cases <- sum(smote_AGNPSR_train_labels == 0)
psr_cases <- sum(smote_AGNPSR_train_labels == 1)

smote_AGNPSR_PLEC_train <- as.matrix(select(smote_PLEC_train, -pulsarness))
dtrain_PLEC <- xgb.DMatrix(data = smote_AGNPSR_PLEC_train, label= smote_AGNPSR_train_labels)

smote_AGNPSR_LP_train <- as.matrix(select(smote_LP_train, -pulsarness))
dtrain_LP <- xgb.DMatrix(data = smote_AGNPSR_LP_train, label= smote_AGNPSR_train_labels)

########### Hyper-Parameter Tuning ##########
# default parameters to find best no. of iteration
params <- list(booster = "gbtree", 
               objective = "binary:logistic", 
               eval_metric = "auc", # "error", 
               eta=0.1, max_depth=6, min_child_weight=1, gamma=1, lambda = 0,
               # scale_pos_weight = agn_cases /  psr_cases, # control for imbalanced classes
               maximize = F , 
               subsample=1, 
               colsample_bytree=1
               )

# Using the inbuilt xgb.cv function to calculate the best nround for this model. 
# In addition, this function also returns CV error, which is an estimate of test error.
xgbcv_PLEC <- xgb.cv( params = params, data = dtrain_PLEC, 
                 nrounds = 100, nfold = 10, showsd = T,
                 print_every_n = 20 ,  stratified = T, early_stopping_rounds = 20, 
                 )

## [1]  train-auc:0.997505+0.000488 test-auc:0.990877+0.005510 
## Multiple eval metrics are present. Will use test_auc for early stopping.
## Will train until test_auc hasn't improved in 20 rounds.
## 
## [21] train-auc:0.999646+0.000204 test-auc:0.997716+0.001608 
## [41] train-auc:0.999984+0.000018 test-auc:0.998640+0.001147 
## [61] train-auc:0.999999+0.000002 test-auc:0.999278+0.000396 
## [81] train-auc:1.000000+0.000000 test-auc:0.999328+0.000397 
## Stopping. Best iteration:
## [72] train-auc:1.000000+0.000000 test-auc:0.999336+0.000394

best_iter <- 100
if (!is.null(xgbcv_PLEC$best_iteration )) {best_iter <- xgbcv_PLEC$best_iteration}
print(paste("best_iteration PLEC:", best_iter))

## [1] "best_iteration PLEC: 72"

# Gradient boost model training
xgb1_PLEC <- xgb.train (params = params, 
                   data = dtrain_PLEC, 
                   nrounds = best_iter, 
                   # early_stopping_rounds = 20, 
                   print_every_n = 20 ,
                   watchlist = list(val=dtest_PLEC, train=dtrain_PLEC)
                   )

## [1]  val-auc:0.958957    train-auc:0.997208 
## [21] val-auc:0.990751    train-auc:0.999627 
## [41] val-auc:0.990591    train-auc:0.999993 
## [61] val-auc:0.990575    train-auc:1.000000 
## [72] val-auc:0.991263    train-auc:1.000000

xgb.save(xgb1_PLEC,'xgb_PLEC.model')

## [1] TRUE

# xgb1_PLEC <- xgb.load('xgb_PLEC.model')

# Using package 'mlr to search for best parameter set
#convert characters to factors
library(mlr)
#create tasks
traintask <- makeClassifTask (data = FGL4_PLEC_train, target = "pulsarness")
testtask <- makeClassifTask (data = FGL4_PLEC_test, target = "pulsarness")

#create learner
lrn <- makeLearner("classif.xgboost", predict.type = "response")
lrn$par.vals <- list( objective="binary:logistic", eval_metric="auc", 
                      nrounds=best_iter, # early_stopping_rounds = 40, 
                      # scale_pos_weight = agn_cases /  psr_cases, 
                      print_every_n = 20)

#set parameter space
params_set <- makeParamSet( makeDiscreteParam("booster",values = c("gbtree")),
                            makeIntegerParam("max_depth", lower = 3L, upper = 10L), 
                            makeNumericParam("min_child_weight", lower = 1L, upper = 10L),
                            makeNumericParam("subsample", lower = 0.7, upper = 1), 
                            makeNumericParam("colsample_bytree", lower = 0.5, upper = 1),
                            makeNumericParam("lambda", lower=0.75, upper=1.25),
                            makeNumericParam("gamma", lower=0, upper=1.5),
                            makeNumericParam("eta", lower = 0.01, upper=0.3))

#set resampling strategy
rdesc <- makeResampleDesc("CV", stratify = T, iters=10L)

#search strategy
ctrl <- makeTuneControlRandom(maxit = 10L)

# Set a parallel backend to ensure faster computation. 
# Make sure not to open several applications in backend. 

library(parallel)
library(parallelMap) 
#parallelStartSocket(cpus = 4) # detectCores()) # does not work in R 4.x
#parallelStop() 
#parameter tuning
mytune <- tuneParams(learner = lrn, task = traintask, 
                     resampling = rdesc, 
                     measures = acc, par.set = params_set, 
                     control = ctrl, show.info = T)
# Save a single object to a file
saveRDS(mytune, "mytune.rds")

# set hyperparameters
# lrn_tune <- setHyperPars(lrn, par.vals = mytune$x)
# train model
# xgmodel <- train(learner = lrn_tune, task = traintask)
# predict model
# xgpred <- predict(xgmodel, testtask)

# Restore mytune
#mytune <- readRDS("mytune.rds")
#mytune$x
set.seed(3)
params <- list(booster = "gbtree", 
               objective = "binary:logistic", 
               eval_metric = "auc",
               eta = 0.05, 
               max_depth = 6, 
               min_child_weight = 8, 
               lambda = 1.1,
               gamma = 1, # add a regularization term
               # scale_pos_weight = agn_cases /  psr_cases, # control for imbalanced classes
               subsample = 0.7, 
               colsample_bytree = 0.9)

# Gradient boost model training
xgb1_PLEC <- xgb.train (params = params, 
                   data = dtrain_PLEC, 
                   nrounds = 500, 
                   watchlist = list(val=dtest_PLEC, train=dtrain_PLEC), 
                   print_every_n = 10, 
                   early_stopping_rounds = 40, 
                   maximize = TRUE 
                   )
xgb.save(xgb1_PLEC,'xgb_PLEC.model')

xgb1_PLEC<- xgb.load('xgb_PLEC.model')
# model prediction
xgbpred_PLEC_test <- predict (xgb1_PLEC, dtest_PLEC)
xgbpred_PLEC_train <- predict (xgb1_PLEC, dtrain_PLEC)
# xgbpred1 <- ifelse (xgbpred1 > 0.5,1,0)
# err <- mean(as.numeric(xgbpred1 > 0.5) != FGL4_AGNPSR_test_labels)
# print(paste("model 1 test-error=", err)) 

# Get Training Set threshold, generate ROC plots, and print contingency tables
Best_threshold_train_PLEC <- ROC_threshold_plots_tables(smote_AGNPSR_train_labels, #FGL4_AGNPSR_train_labels, 
                                                   xgbpred_PLEC_train,
                                                   FGL4_AGNPSR_test_labels, 
                                                   xgbpred_PLEC_test, pos="1", neg = "0")

##                    real_category
## Predict_class_train    0    1
##                   0 2220    0
##                   1    0 2405
##                   real_category
## Predict_class_test   0   1
##                  0 984   8
##                  1   8  55

#Predict_class_test   0   1
#                 0 982   6
 #                1  10  57
cat("Best threshold from xgBoost:", Best_threshold_train_PLEC)

## Best threshold from xgBoost: 0.5802544 1 1

# Get the confusion matrix and statistics
xgbpred_PLEC_test <- ifelse (xgbpred_PLEC_test > Best_threshold_train_PLEC[1],1,0)
cm_xgb_PLEC<- confusionMatrix(factor(xgbpred_PLEC_test), factor(FGL4_AGNPSR_test_labels), positive="1")
print(cm_xgb_PLEC)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 984   8
##          1   8  55
##                                           
##                Accuracy : 0.9848          
##                  95% CI : (0.9755, 0.9913)
##     No Information Rate : 0.9403          
##     P-Value [Acc > NIR] : 5.666e-13       
##                                           
##                   Kappa : 0.865           
##                                           
##  Mcnemar's Test P-Value : 1               
##                                           
##             Sensitivity : 0.87302         
##             Specificity : 0.99194         
##          Pos Pred Value : 0.87302         
##          Neg Pred Value : 0.99194         
##              Prevalence : 0.05972         
##          Detection Rate : 0.05213         
##    Detection Prevalence : 0.05972         
##       Balanced Accuracy : 0.93248         
##                                           
##        'Positive' Class : 1               
##

# Add XGBoost Prediction probabilities to FGL4_results
FGL4_tidy_num <- select(FGL4_tidy, -CLASS1, -ASSOC1, -ASSOC2, -CLASS2, 
                      -SpectrumType, -Source_Name, -ASSOC_FGL, -GLAT, -Conf_95_PosAng, -agnness)
FGL4_tidy_num_PLEC <- select(FGL4_tidy_num, -starts_with("PL_"), - starts_with("LP_"), -pulsarness)
FGL4_tidy_num_PLEC_labels <- as.matrix(rep(0,nrow(FGL4_tidy_num_PLEC)))
dFGL4_tidy_PLEC <- xgb.DMatrix(data = as.matrix(as.data.frame(FGL4_tidy_num_PLEC)), 
                               label= FGL4_tidy_num_PLEC_labels)

FGL4_results$XGB_P_PLEC <- predict(xgb1_PLEC, newdata = dFGL4_tidy_PLEC, type = "Prob")
FGL4_results$XGB_Pred_PLEC <- ifelse (FGL4_results$XGB_P_PLEC > Best_threshold_train_PLEC[1],c("PSR"), c("AGN"))

# Plot 
par(mfrow = c(1,1))

#view variable importance plot
mat <- xgb.importance (feature_names = names(FGL4_AGNPSR_PLEC), model = xgb1_PLEC)
xgb.ggplot.importance(importance_matrix = mat)

xgb1_LP<- xgb.load('xgb_LP.model')
# model prediction
xgbpred_LP_test <- predict (xgb1_LP, dtest_LP)
xgbpred_LP_train <- predict (xgb1_LP, dtrain_LP)
# xgbpred1 <- ifelse (xgbpred1 > 0.5,1,0)
# err <- mean(as.numeric(xgbpred1 > 0.5) != FGL4_AGNPSR_test_labels)
# print(paste("model 1 test-error=", err)) 

# Get Training Set threshold, generate ROC plots, and print contingency tables
Best_threshold_train_LP <- ROC_threshold_plots_tables(smote_AGNPSR_train_labels, #FGL4_AGNPSR_train_labels, 
                                                   xgbpred_LP_train,
                                                   FGL4_AGNPSR_test_labels, 
                                                   xgbpred_LP_test, pos="1", neg = "0")

##                    real_category
## Predict_class_train    0    1
##                   0 2214    3
##                   1    6 2402
##                   real_category
## Predict_class_test   0   1
##                  0 981   6
##                  1  11  57

cat("Best threshold from xgBoost for LP:", Best_threshold_train_LP)

## Best threshold from xgBoost for LP: 0.578409 0.9987526 0.9972973

# Get the confusion matrix and statistics
xgbpred_LP_test <- ifelse (xgbpred_LP_test > Best_threshold_train_LP[1],1,0)
cm_xgb_LP<- confusionMatrix(factor(xgbpred_LP_test), factor(FGL4_AGNPSR_test_labels), positive="1")
print(cm_xgb_LP)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 981   6
##          1  11  57
##                                           
##                Accuracy : 0.9839          
##                  95% CI : (0.9743, 0.9906)
##     No Information Rate : 0.9403          
##     P-Value [Acc > NIR] : 2.244e-12       
##                                           
##                   Kappa : 0.8617          
##                                           
##  Mcnemar's Test P-Value : 0.332           
##                                           
##             Sensitivity : 0.90476         
##             Specificity : 0.98891         
##          Pos Pred Value : 0.83824         
##          Neg Pred Value : 0.99392         
##              Prevalence : 0.05972         
##          Detection Rate : 0.05403         
##    Detection Prevalence : 0.06445         
##       Balanced Accuracy : 0.94684         
##                                           
##        'Positive' Class : 1               
##

# Add XGBoost Prediction probabilities to FGL4_results
FGL4_tidy_num_LP <- select(FGL4_tidy_num, -starts_with("PL_"), - starts_with("PLEC_"), -pulsarness)
FGL4_tidy_num_LP_labels <- as.matrix(rep(0,nrow(FGL4_tidy_num_LP)))
dFGL4_tidy_LP <- xgb.DMatrix(data = as.matrix(as.data.frame(FGL4_tidy_num_LP)), 
                               label= FGL4_tidy_num_LP_labels)

FGL4_results$XGB_P_LP <- predict(xgb1_PLEC, newdata = dFGL4_tidy_LP, type = "Prob")
FGL4_results$XGB_Pred_LP <- ifelse (FGL4_results$XGB_P_LP > Best_threshold_train_LP[1],c("PSR"), c("AGN"))

# Plot 
par(mfrow = c(1,1))

#view variable importance plot
mat <- xgb.importance (feature_names = names(FGL4_AGNPSR_LP), model = xgb1_LP)
xgb.ggplot.importance(importance_matrix = mat)

save(FGL4_results, file = "FGL4_results.rData", compress = "xz")