Classification on 4FGL (Part2)

Note that the R library keras conflicts with other R libraries. Better remove other unnecessary R packages for this script.

Step 2 (as in part 1). Function to calculate AUC as an evaluation metric, get best threshold and plot ROC curves

# Load workspace from previous step
library(keras)
#install_keras()
library(pROC)
load("FGL4_results.rData")
load(".RData")
kerasAUC <- function(y_true, y_pred){
    true= k_flatten(y_true)
    pred = k_flatten(y_pred)
  
    #total number of elements in this batch
    totalCount = k_shape(true)[1]
  
    #sorting the prediction values in descending order
    values = tensorflow::tf$nn$top_k(pred,k=totalCount)
    indices<-values[[1]]
    values<-values[[0]]
  
    #sorting the ground truth values based on the predictions above         
    sortedTrue = k_gather(true, indices)
  
    #getting the ground negative elements (already sorted above)
    negatives = 1 - sortedTrue
  
    #the true positive count per threshold
    TPCurve = k_cumsum(sortedTrue)
  
    #area under the curve
    auc = k_sum(TPCurve * negatives)
  
    #normalizing the result between 0 and 1
    totalCount = k_cast(totalCount, k_floatx())
    positiveCount = k_sum(true)
    negativeCount = totalCount - positiveCount
    totalArea = positiveCount * negativeCount
    return  (auc / totalArea)
}

ROC_threshold_plots_tables <- function(truth_train, prediction_train, 
                                       truth_test, prediction_test, 
                                       threshold = 0, pos = "Pulsar", neg = "Non-Pulsar", print_table=TRUE) {
        if (threshold == 0) {
                # Compute best threshold if none is provided
                ROC <- roc(truth_train, prediction_train, levels = c(pos,neg))
                ROC_table <- cbind(ROC$thresholds, ROC$specificities, ROC$sensitivities) 
                best_threshold <- ROC_table[which.max(ROC_table[, 2] + ROC_table[, 3]), ]
        } else {
                # Don't compute threshold if one is provided 
                best_threshold <- threshold
        }
        if (nargs() > 2) {
                # Make ROC plots
                par(mfrow = c(1, 2))
                ROC_train <- roc(truth_train, prediction_train, levels = c(pos,neg),
                        plot = TRUE, print.auc = TRUE, main = "ROC Train", print.thres = "best") 
                ROC_test <- roc(truth_test, prediction_test, levels = c(pos,neg),
                        plot = TRUE, print.auc = TRUE, main = "ROC Test", print.thres = "best")
                
                if (print_table) {
                        # Generate contingency tables
                        # Train data
                        nrow_train <- length(prediction_train)
                        Predict_class_train <- rep("NA", nrow_train)
                        Predict_class_train <- ifelse(prediction_train > best_threshold[1], pos, neg) 
                        real_category <- truth_train
                        print(table(Predict_class_train, real_category))
                        
                        # Test data
                        nrow_test <- length(prediction_test)
                        Predict_class_test <- rep("NA", nrow_test)
                        Predict_class_test <- ifelse(prediction_test > best_threshold[1], pos, neg) 
                        real_category <- truth_test
                        print(table(Predict_class_test, real_category))
                }
        } 
        return(best_threshold)
}

# Save current workspace for subsequent steps
# save.image()

Step 7. AGN vs PSR classification using ANN

# load(".RData")
# library(keras)
y_train <- to_categorical(smote_AGNPSR_train_labels, 2) # agn = 1, pulsar = 2
y_test<- to_categorical(FGL4_AGNPSR_test_labels,2)

x_train <- as.data.frame(smote_PLEC_train[, -ncol(smote_PLEC_train)])
x_train <- as.matrix(x_train)

x_test <- as.matrix(FGL4_AGNPSR_PLEC_test)
#head(smote_PLEC_train)
#table(smote_PLEC_train$pulsarness)

# create metric using backend tensor functions
metric_kerasAUC <- custom_metric("kerasAUC", function(y_true, y_pred) {
  kerasAUC(y_true, y_pred)
})
model <- keras_model_sequential() 
model %>% 
  layer_dense(units = 14, # Positive integer, dimensionality of the output space.
              activation = 'relu', bias_regularizer = regularizer_l2(0.005),
              input_shape = c(ncol(smote_PLEC_train)-1)) %>% 
  layer_dropout(rate = 0.3) %>% 
  layer_dense(units = 6, activation = 'relu', bias_regularizer = regularizer_l2(0.01)) %>%
  layer_dropout(rate = 0.2) %>%
  layer_dense(units = 2, activation = 'softmax')

summary(model)

model %>% compile(
  loss = 'categorical_crossentropy',
  optimizer = optimizer_rmsprop(lr = 0.01),
  metrics = c(metric_kerasAUC, 'accuracy')
)

history <- model %>% fit(
  x_train, y_train, 
  epochs = 100, # batch_size = 128, 
  validation_split = 0.2
)

model %>% save_model_hdf5("my_model_best.h5")
y_train_pred <- model %>% predict_classes(x_train)
y_test_pred <- model %>% predict_classes(x_test)
y_train_p <- model %>%  predict_proba(x_train)
y_test_p <- model %>%  predict_proba(x_test)
# score <- model %>% evaluate(x_test, y_test)
# factor(y_test_pred)
Best_threshold_train_PLEC <- ROC_threshold_plots_tables(c(smote_AGNPSR_train_labels), #FGL4_AGNPSR_train_labels, 
                                                        y_train_p[,1],
                                                        c(FGL4_AGNPSR_test_labels), 
                                                        y_test_p[,1], pos=1, neg = 0)

##                    real_category
## Predict_class_train    0    1
##                   0  170 2333
##                   1 2050   72
##                   real_category
## Predict_class_test   0   1
##                  0  58  58
##                  1 934   5
# Get the confusion matrix and statistics
library(caret)
#y_test_pred <- ifelse (y_test_pred > Best_threshold_train_PLEC[1],1,0)
cm_ann_PLEC<- confusionMatrix(factor(y_test_pred), factor(c(FGL4_AGNPSR_test_labels)), positive="1")
print(cm_ann_PLEC)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 938   5
##          1  54  58
##                                           
##                Accuracy : 0.9441          
##                  95% CI : (0.9285, 0.9572)
##     No Information Rate : 0.9403          
##     P-Value [Acc > NIR] : 0.3302          
##                                           
##                   Kappa : 0.635           
##                                           
##  Mcnemar's Test P-Value : 4.129e-10       
##                                           
##             Sensitivity : 0.92063         
##             Specificity : 0.94556         
##          Pos Pred Value : 0.51786         
##          Neg Pred Value : 0.99470         
##              Prevalence : 0.05972         
##          Detection Rate : 0.05498         
##    Detection Prevalence : 0.10616         
##       Balanced Accuracy : 0.93310         
##                                           
##        'Positive' Class : 1               
## 
# Add ANN Prediction probabilities to FGL4_results
FGL4_tidy_num <- select(FGL4_tidy, -CLASS1, -ASSOC1, -ASSOC2, -CLASS2, 
                      -SpectrumType, -Source_Name, -ASSOC_FGL, -GLAT, -Conf_95_PosAng, -agnness)
FGL4_tidy_num_PLEC <- select(FGL4_tidy_num, -starts_with("PL_"), - starts_with("LP_"), -pulsarness)
FGL4_mat <- as.matrix(FGL4_tidy_num_PLEC)
FGL4_results$ANN_Pred_PLEC <- model %>% predict_classes(FGL4_mat)
FGL4_results$ANN_P_PLEC <- model %>%  predict_proba(FGL4_mat)
##### ##### ##### ##### ##### ##### ##### ##### ##### 
# select the group of variable using LP spectrum type
x_train_LP <- as.data.frame(smote_LP_train[, -ncol(smote_LP_train)])
x_train_LP <- as.matrix(x_train_LP)
x_test_LP <- as.matrix(FGL4_AGNPSR_LP_test)
model <- keras_model_sequential() 
model %>% 
  layer_dense(units = 12, # Positive integer, dimensionality of the output space.
              activation = 'relu', bias_regularizer = regularizer_l2(0.005),
              input_shape = c(ncol(smote_LP_train)-1)) %>% 
  layer_dropout(rate = 0.3) %>% 
  layer_dense(units = 6, activation = 'relu', bias_regularizer = regularizer_l2(0.01)) %>%
  layer_dropout(rate = 0.2) %>%
  layer_dense(units = 2, activation = 'softmax')

summary(model)

model %>% compile(
  loss = 'categorical_crossentropy',
  optimizer = optimizer_rmsprop(lr = 0.01),
  metrics = c(metric_kerasAUC, 'accuracy')
)

history <- model %>% fit(
  x_train, y_train, 
  epochs = 100, # batch_size = 128, 
  validation_split = 0.2
)

model %>% save_model_hdf5("my_model_best_LP.h5")
y_train_pred_LP <- model %>% predict_classes(x_train_LP)
y_test_pred_LP <- model %>% predict_classes(x_test_LP)
y_train_p_LP <- model %>%  predict_proba(x_train_LP)
y_test_p_LP <- model %>%  predict_proba(x_test_LP)
#score <- model %>% evaluate(x_test, y_test)
# factor(y_test_pred)
Best_threshold_train_PLEC <- ROC_threshold_plots_tables(c(smote_AGNPSR_train_labels), #FGL4_AGNPSR_train_labels, 
                                                        y_train_p_LP[,1],
                                                        c(FGL4_AGNPSR_test_labels), 
                                                        y_test_p_LP[,1], pos=1, neg = 0)

##                    real_category
## Predict_class_train    0    1
##                   0  234 2343
##                   1 1986   62
##                   real_category
## Predict_class_test   0   1
##                  0 100  62
##                  1 892   1
# Get the confusion matrix and statistics
#y_test_pred <- ifelse (y_test_pred > Best_threshold_train_PLEC[1],1,0)
cm_ann_LP<- confusionMatrix(factor(y_test_pred_LP), factor(c(FGL4_AGNPSR_test_labels)), positive="1")
print(cm_ann_LP)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 891   1
##          1 101  62
##                                           
##                Accuracy : 0.9033          
##                  95% CI : (0.8839, 0.9205)
##     No Information Rate : 0.9403          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.5061          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.98413         
##             Specificity : 0.89819         
##          Pos Pred Value : 0.38037         
##          Neg Pred Value : 0.99888         
##              Prevalence : 0.05972         
##          Detection Rate : 0.05877         
##    Detection Prevalence : 0.15450         
##       Balanced Accuracy : 0.94116         
##                                           
##        'Positive' Class : 1               
## 
# Add ANN Prediction probabilities to FGL4_results
FGL4_tidy_num_LP <- select(FGL4_tidy_num, -starts_with("PL_"), - starts_with("PLEC_"), -pulsarness)
FGL4_mat <- as.matrix(FGL4_tidy_num_LP)
FGL4_results$ANN_Pred_LP <- model %>% predict_classes(FGL4_mat)
FGL4_results$ANN_P_LP <- model %>%  predict_proba(FGL4_mat)

save(FGL4_results, file = "FGL4_results.rData", compress = "xz")

Step 8. AGN vs PSR classification using Gaussian Mixture Model

library(ClusterR)
load("fermicatsR/FGL4_tidy.rData")
library(caret)
library(dplyr)

pulsarness_labels <- select(FGL4_AGNPSR, pulsarness)
#convert labels to pulsar = 1, non-pulsar = 0
pulsarness_labels$pulsarness <- as.numeric(pulsarness_labels$pulsarness=="Pulsar")

##### ##### ##### ##### ##### ##### ##### ##### ##### 
# select the group of variable using PLEC spectrum type
FGL4_AGNPSR_PLEC <-  select(FGL4_AGNPSR, -starts_with("PL_"), - starts_with("LP_"), -pulsarness)
FGL4_AGNPSR_PLEC_mat <- as.matrix(FGL4_AGNPSR_PLEC)
gmm_PLEC = GMM(FGL4_AGNPSR_PLEC_mat, 2, "eucl_dist", "random_subset", 10, 10, verbose = TRUE)
## gmm_diag::learn(): generating initial means
## gmm_diag::learn(): k-means: n_threads: 1
## gmm_diag::learn(): k-means: iteration:    1   delta: 2.18713e+06
## gmm_diag::learn(): k-means: iteration:    2   delta: 389794
## gmm_diag::learn(): k-means: iteration:    3   delta: 198033
## gmm_diag::learn(): k-means: iteration:    4   delta: 125302
## gmm_diag::learn(): k-means: iteration:    5   delta: 84560.1
## gmm_diag::learn(): k-means: iteration:    6   delta: 64403.8
## gmm_diag::learn(): k-means: iteration:    7   delta: 48697.3
## gmm_diag::learn(): k-means: iteration:    8   delta: 41072.8
## gmm_diag::learn(): k-means: iteration:    9   delta: 33630.5
## gmm_diag::learn(): k-means: iteration:   10   delta: 27406.7
## gmm_diag::learn(): generating initial covariances
## gmm_diag::learn(): EM: n_threads: 1
## gmm_diag::learn(): EM: iteration:    1   avg_log_p: -11.64
## gmm_diag::learn(): EM: iteration:    2   avg_log_p: -11.5138
## gmm_diag::learn(): EM: iteration:    3   avg_log_p: -11.4754
## gmm_diag::learn(): EM: iteration:    4   avg_log_p: -11.4581
## gmm_diag::learn(): EM: iteration:    5   avg_log_p: -11.4499
## gmm_diag::learn(): EM: iteration:    6   avg_log_p: -11.4447
## gmm_diag::learn(): EM: iteration:    7   avg_log_p: -11.441
## gmm_diag::learn(): EM: iteration:    8   avg_log_p: -11.4389
## gmm_diag::learn(): EM: iteration:    9   avg_log_p: -11.4374
## gmm_diag::learn(): EM: iteration:   10   avg_log_p: -11.4365
## 
## time to complete : 0.00670757
pr_PLEC = predict_GMM(FGL4_AGNPSR_PLEC_mat, gmm_PLEC$centroids, gmm_PLEC$covariance_matrices, gmm_PLEC$weights)
pr_PLEC$cluster_labels = 1 - pr_PLEC$cluster_labels
table(pr_PLEC$cluster_labels,pulsarness_labels$pulsarness)
##    
##        0    1
##   0 2925  135
##   1  343  113
# Get the confusion matrix and statistics
Best_threshold_train <- ROC_threshold_plots_tables(pulsarness_labels$pulsarness, #FGL4_AGNPSR_train_labels, 
                                                        pr_PLEC$cluster_proba[,2],
                                                        pulsarness_labels$pulsarness,
                                                        pr_PLEC$cluster_proba[,1], pos=1, neg=0)

##                    real_category
## Predict_class_train    0    1
##                   0  938  200
##                   1 2330   48
##                   real_category
## Predict_class_test    0    1
##                  0 3162  241
##                  1  106    7
cm_gmm_PLEC<- confusionMatrix(factor(pr_PLEC$cluster_labels), factor(c(pulsarness_labels$pulsarness)), positive="1")
print(cm_gmm_PLEC)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 2925  135
##          1  343  113
##                                           
##                Accuracy : 0.8641          
##                  95% CI : (0.8523, 0.8752)
##     No Information Rate : 0.9295          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.2527          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.45565         
##             Specificity : 0.89504         
##          Pos Pred Value : 0.24781         
##          Neg Pred Value : 0.95588         
##              Prevalence : 0.07053         
##          Detection Rate : 0.03214         
##    Detection Prevalence : 0.12969         
##       Balanced Accuracy : 0.67534         
##                                           
##        'Positive' Class : 1               
## 
##### ##### ##### ##### ##### ##### ##### ##### ##### 
# select the group of variable using LP spectrum type
FGL4_AGNPSR_LP <-  select(FGL4_AGNPSR, -starts_with("PLEC_"), - starts_with("PL_"), -pulsarness)
FGL4_AGNPSR_LP_mat <- as.matrix(FGL4_AGNPSR_LP)
gmm_LP = GMM(FGL4_AGNPSR_LP_mat, 2, "eucl_dist", "random_subset", 10, 10, verbose = TRUE)
## gmm_diag::learn(): generating initial means
## gmm_diag::learn(): k-means: n_threads: 1
## gmm_diag::learn(): k-means: iteration:    1   delta: 2.18713e+06
## gmm_diag::learn(): k-means: iteration:    2   delta: 389794
## gmm_diag::learn(): k-means: iteration:    3   delta: 198033
## gmm_diag::learn(): k-means: iteration:    4   delta: 125302
## gmm_diag::learn(): k-means: iteration:    5   delta: 84560.1
## gmm_diag::learn(): k-means: iteration:    6   delta: 64403.8
## gmm_diag::learn(): k-means: iteration:    7   delta: 48697.3
## gmm_diag::learn(): k-means: iteration:    8   delta: 41072.8
## gmm_diag::learn(): k-means: iteration:    9   delta: 33630.5
## gmm_diag::learn(): k-means: iteration:   10   delta: 27406.7
## gmm_diag::learn(): generating initial covariances
## gmm_diag::learn(): EM: n_threads: 1
## gmm_diag::learn(): EM: iteration:    1   avg_log_p: -24.4655
## gmm_diag::learn(): EM: iteration:    2   avg_log_p: -24.0082
## gmm_diag::learn(): EM: iteration:    3   avg_log_p: -23.7745
## gmm_diag::learn(): EM: iteration:    4   avg_log_p: -23.642
## gmm_diag::learn(): EM: iteration:    5   avg_log_p: -23.5801
## gmm_diag::learn(): EM: iteration:    6   avg_log_p: -23.5476
## gmm_diag::learn(): EM: iteration:    7   avg_log_p: -23.5284
## gmm_diag::learn(): EM: iteration:    8   avg_log_p: -23.5172
## gmm_diag::learn(): EM: iteration:    9   avg_log_p: -23.5103
## gmm_diag::learn(): EM: iteration:   10   avg_log_p: -23.5058
## 
## time to complete : 0.00570633
pr_LP = predict_GMM(FGL4_AGNPSR_LP_mat, gmm_LP$centroids, gmm_LP$covariance_matrices, gmm_LP$weights)
pr_LP$cluster_labels = 1 - pr_LP$cluster_labels
table(pr_LP$cluster_labels,pulsarness_labels$pulsarness)
##    
##        0    1
##   0 2408   67
##   1  860  181
# Get the confusion matrix and statistics
Best_threshold_train <- ROC_threshold_plots_tables(pulsarness_labels$pulsarness, #FGL4_AGNPSR_train_labels, 
                                                        pr_LP$cluster_proba[,2],
                                                        pulsarness_labels$pulsarness,
                                                        pr_LP$cluster_proba[,1], pos=1, neg=0)

##                    real_category
## Predict_class_train    0    1
##                   0 1154  206
##                   1 2114   42
##                   real_category
## Predict_class_test    0    1
##                  0 2628  173
##                  1  640   75
cm_gmm_LP<- confusionMatrix(factor(pr_LP$cluster_labels), factor(c(pulsarness_labels$pulsarness)), positive="1")
print(cm_gmm_LP)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 2408   67
##          1  860  181
##                                           
##                Accuracy : 0.7363          
##                  95% CI : (0.7214, 0.7509)
##     No Information Rate : 0.9295          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.1884          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.72984         
##             Specificity : 0.73684         
##          Pos Pred Value : 0.17387         
##          Neg Pred Value : 0.97293         
##              Prevalence : 0.07053         
##          Detection Rate : 0.05148         
##    Detection Prevalence : 0.29608         
##       Balanced Accuracy : 0.73334         
##                                           
##        'Positive' Class : 1               
## 
# Save current workspace for subsequent steps
save.image()