Classification on 4FGL (Part2)
Via Artificial Neural Network and Gaussian Mixture Model
SH
Note that the R package keras can conflict with other R packages; it is best to detach any packages you don't need before running this script (a sketch follows).
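A minimal sketch for that cleanup (assumes an interactive session; adjust to your setup):
# Sketch: detach all currently attached non-base packages before loading keras
attached <- names(sessionInfo()$otherPkgs)
for (pkg in attached) {
  detach(paste0("package:", pkg), character.only = TRUE, unload = TRUE)
}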
Other links
This is Script 3.
Script 2: Classification on 4FGL (Part 1). See here.
Script 1: Comparing FGL3 unclassified predictions and FGL4 discoveries. See here.
Script 0: Data pre-processing. "FGL4_tidy_for_FGL3.rData" contains the processed 4FGL catalog data. See the processing (and preliminary feature selection via correlation) steps here.
Step 2 (as in Part 1). Function to calculate AUC as an evaluation metric, find the best threshold, and plot ROC curves
# Load workspace from previous step
library(keras)
# install_keras() # run once to set up the Python backend
library(pROC)
load("FGL4_results.rData")
load(".RData")
kerasAUC <- function(y_true, y_pred) {
  true <- k_flatten(y_true)
  pred <- k_flatten(y_pred)
  # total number of elements in this batch
  totalCount <- k_shape(true)[1]
  # sort the prediction values in descending order
  result <- tensorflow::tf$nn$top_k(pred, k = totalCount)
  values <- result$values   # access by name to avoid 0- vs 1-based indexing bugs
  indices <- result$indices
  # sort the ground-truth values by the predictions above
  sortedTrue <- k_gather(true, indices)
  # the ground-truth negatives (already sorted above)
  negatives <- 1 - sortedTrue
  # the true-positive count per threshold
  TPCurve <- k_cumsum(sortedTrue)
  # area under the curve: counts the correctly ordered (positive, negative) pairs
  auc <- k_sum(TPCurve * negatives)
  # normalize by the total number of (positive, negative) pairs
  totalCount <- k_cast(totalCount, k_floatx())
  positiveCount <- k_sum(true)
  negativeCount <- totalCount - positiveCount
  totalArea <- positiveCount * negativeCount
  return(auc / totalArea)
}
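The counting trick above is the Mann-Whitney formulation of AUC: the number of (positive, negative) pairs in which the positive outranks the negative, divided by the number of all such pairs. A quick plain-R check of the same identity on made-up data (pROC is loaded above):
# Plain-R check of the pairwise-counting identity used in kerasAUC
truth <- c(1, 0, 1, 0, 0, 1)
score <- c(0.9, 0.8, 0.7, 0.4, 0.3, 0.2)
sortedTrue <- truth[order(score, decreasing = TRUE)]
sum(cumsum(sortedTrue) * (1 - sortedTrue)) / (sum(truth) * sum(1 - truth))
# 5/9 = 0.5556, which matches:
as.numeric(auc(roc(truth, score)))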
ROC_threshold_plots_tables <- function(truth_train, prediction_train,
                                       truth_test, prediction_test,
                                       threshold = 0, pos = "Pulsar", neg = "Non-Pulsar",
                                       print_table = TRUE) {
  if (threshold == 0) {
    # Compute the best threshold if none is provided
    # (the row maximizing specificity + sensitivity, i.e. Youden's J)
    ROC <- roc(truth_train, prediction_train, levels = c(pos, neg))
    ROC_table <- cbind(ROC$thresholds, ROC$specificities, ROC$sensitivities)
    best_threshold <- ROC_table[which.max(ROC_table[, 2] + ROC_table[, 3]), ]
  } else {
    # Use the provided threshold as-is
    best_threshold <- threshold
  }
  if (nargs() > 2) {
    # Make ROC plots for train and test side by side
    par(mfrow = c(1, 2))
    ROC_train <- roc(truth_train, prediction_train, levels = c(pos, neg),
                     plot = TRUE, print.auc = TRUE, main = "ROC Train", print.thres = "best")
    ROC_test <- roc(truth_test, prediction_test, levels = c(pos, neg),
                    plot = TRUE, print.auc = TRUE, main = "ROC Test", print.thres = "best")
    if (print_table) {
      # Generate contingency tables
      # Train data
      Predict_class_train <- ifelse(prediction_train > best_threshold[1], pos, neg)
      real_category <- truth_train
      print(table(Predict_class_train, real_category))
      # Test data
      Predict_class_test <- ifelse(prediction_test > best_threshold[1], pos, neg)
      real_category <- truth_test
      print(table(Predict_class_test, real_category))
    }
  }
  return(best_threshold)
}
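For reference, a hypothetical minimal call on toy data (toy_truth/toy_score are made up for illustration; with 0/1 labels, pass pos = 1 and neg = 0 as below):
# Hypothetical usage: 0/1 labels and predicted probabilities of class 1
toy_truth <- c(1, 1, 0, 0, 1, 0, 0, 0)
toy_score <- c(0.91, 0.72, 0.55, 0.40, 0.38, 0.22, 0.18, 0.05)
thr <- ROC_threshold_plots_tables(toy_truth, toy_score,
                                  toy_truth, toy_score,
                                  pos = 1, neg = 0)
thr # best threshold with its specificity and sensitivity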
# Save current workspace for subsequent steps
# save.image()
Step 7. AGN vs PSR classification using ANN
# load(".RData")
# library(keras)
y_train <- to_categorical(smote_AGNPSR_train_labels, 2) # one-hot: column 1 = AGN (label 0), column 2 = pulsar (label 1)
y_test <- to_categorical(FGL4_AGNPSR_test_labels, 2)
x_train <- as.data.frame(smote_PLEC_train[, -ncol(smote_PLEC_train)]) # drop the last column (the class label)
x_train <- as.matrix(x_train)
x_test <- as.matrix(FGL4_AGNPSR_PLEC_test)
#head(smote_PLEC_train)
#table(smote_PLEC_train$pulsarness)
# create metric using backend tensor functions
metric_kerasAUC <- custom_metric("kerasAUC", function(y_true, y_pred) {
kerasAUC(y_true, y_pred)
})
model <- keras_model_sequential()
model %>%
  layer_dense(units = 14, # positive integer, dimensionality of the output space
              activation = 'relu', bias_regularizer = regularizer_l2(0.005),
              input_shape = c(ncol(smote_PLEC_train) - 1)) %>%
  layer_dropout(rate = 0.3) %>%
  layer_dense(units = 6, activation = 'relu', bias_regularizer = regularizer_l2(0.01)) %>%
  layer_dropout(rate = 0.2) %>%
  layer_dense(units = 2, activation = 'softmax')
summary(model)
model %>% compile(
  loss = 'categorical_crossentropy',
  optimizer = optimizer_rmsprop(lr = 0.01),
  metrics = c(metric_kerasAUC, 'accuracy')
)
history <- model %>% fit(
  x_train, y_train,
  epochs = 100, # batch_size = 128,
  validation_split = 0.2
)
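To inspect the run, the returned history object can be plotted directly; this shows loss, accuracy, and the custom kerasAUC metric per epoch for both the training and validation splits.
# Visualize per-epoch metrics for the training and validation splits
plot(history)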
model %>% save_model_hdf5("my_model_best.h5")
y_train_pred <- model %>% predict_classes(x_train)
y_test_pred <- model %>% predict_classes(x_test)
y_train_p <- model %>% predict_proba(x_train)
y_test_p <- model %>% predict_proba(x_test)
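Side note: predict_classes() and predict_proba() have since been removed from newer releases of the keras R interface (around keras/TensorFlow 2.6, if memory serves; treat the version as an assumption). An equivalent sketch under the newer API:
# Sketch for newer keras versions without predict_classes()/predict_proba():
# both can be derived from predict() on a softmax output
prob <- model %>% predict(x_test)  # per-class probabilities, as predict_proba() returned
cls  <- max.col(prob) - 1          # 0-based class labels, as predict_classes() returned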
# score <- model %>% evaluate(x_test, y_test)
# factor(y_test_pred)
Best_threshold_train_PLEC <- ROC_threshold_plots_tables(c(smote_AGNPSR_train_labels), # FGL4_AGNPSR_train_labels,
                                                        y_train_p[, 1],
                                                        c(FGL4_AGNPSR_test_labels),
                                                        y_test_p[, 1], pos = 1, neg = 0)
##                    real_category
## Predict_class_train    0    1
##                   0   170 2333
##                   1  2050   72
##                   real_category
## Predict_class_test   0  1
##                  0  58 58
##                  1 934  5
# Get the confusion matrix and statistics
library(caret)
#y_test_pred <- ifelse (y_test_pred > Best_threshold_train_PLEC[1],1,0)
cm_ann_PLEC <- confusionMatrix(factor(y_test_pred), factor(c(FGL4_AGNPSR_test_labels)), positive = "1")
print(cm_ann_PLEC)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 938 5
## 1 54 58
##
## Accuracy : 0.9441
## 95% CI : (0.9285, 0.9572)
## No Information Rate : 0.9403
## P-Value [Acc > NIR] : 0.3302
##
## Kappa : 0.635
##
## Mcnemar's Test P-Value : 4.129e-10
##
## Sensitivity : 0.92063
## Specificity : 0.94556
## Pos Pred Value : 0.51786
## Neg Pred Value : 0.99470
## Prevalence : 0.05972
## Detection Rate : 0.05498
## Detection Prevalence : 0.10616
## Balanced Accuracy : 0.93310
##
## 'Positive' Class : 1
##
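As a sanity check, the headline statistics follow directly from the test confusion matrix above:
# Recompute the reported statistics from the confusion matrix entries
TP <- 58; FN <- 5; FP <- 54; TN <- 938
TP / (TP + FN)                    # Sensitivity: 0.92063
TN / (TN + FP)                    # Specificity: 0.94556
TP / (TP + FP)                    # Pos Pred Value: 0.51786
(TP + TN) / (TP + TN + FP + FN)   # Accuracy: 0.9441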
# Add ANN Prediction probabilities to FGL4_results
FGL4_tidy_num <- select(FGL4_tidy, -CLASS1, -ASSOC1, -ASSOC2, -CLASS2,
                        -SpectrumType, -Source_Name, -ASSOC_FGL, -GLAT, -Conf_95_PosAng, -agnness)
FGL4_tidy_num_PLEC <- select(FGL4_tidy_num, -starts_with("PL_"), -starts_with("LP_"), -pulsarness)
FGL4_mat <- as.matrix(FGL4_tidy_num_PLEC)
FGL4_results$ANN_Pred_PLEC <- model %>% predict_classes(FGL4_mat)
FGL4_results$ANN_P_PLEC <- model %>% predict_proba(FGL4_mat)
##### ##### ##### ##### ##### ##### ##### ##### #####
# select the group of variables using the LP spectrum type
x_train_LP <- as.data.frame(smote_LP_train[, -ncol(smote_LP_train)]) # drop the last column (the class label)
x_train_LP <- as.matrix(x_train_LP)
x_test_LP <- as.matrix(FGL4_AGNPSR_LP_test)
model <- keras_model_sequential()
model %>%
  layer_dense(units = 12, # positive integer, dimensionality of the output space
              activation = 'relu', bias_regularizer = regularizer_l2(0.005),
              input_shape = c(ncol(smote_LP_train) - 1)) %>%
  layer_dropout(rate = 0.3) %>%
  layer_dense(units = 6, activation = 'relu', bias_regularizer = regularizer_l2(0.01)) %>%
  layer_dropout(rate = 0.2) %>%
  layer_dense(units = 2, activation = 'softmax')
summary(model)
model %>% compile(
  loss = 'categorical_crossentropy',
  optimizer = optimizer_rmsprop(lr = 0.01),
  metrics = c(metric_kerasAUC, 'accuracy')
)
history <- model %>% fit(
  x_train_LP, y_train, # fit on the LP feature matrix, not the PLEC x_train
  epochs = 100, # batch_size = 128,
  validation_split = 0.2
)
model %>% save_model_hdf5("my_model_best_LP.h5")
y_train_pred_LP <- model %>% predict_classes(x_train_LP)
y_test_pred_LP <- model %>% predict_classes(x_test_LP)
y_train_p_LP <- model %>% predict_proba(x_train_LP)
y_test_p_LP <- model %>% predict_proba(x_test_LP)
#score <- model %>% evaluate(x_test, y_test)
# factor(y_test_pred)
Best_threshold_train_LP <- ROC_threshold_plots_tables(c(smote_AGNPSR_train_labels), # FGL4_AGNPSR_train_labels,
                                                      y_train_p_LP[, 1],
                                                      c(FGL4_AGNPSR_test_labels),
                                                      y_test_p_LP[, 1], pos = 1, neg = 0)
##                    real_category
## Predict_class_train    0    1
##                   0   234 2343
##                   1  1986   62
##                   real_category
## Predict_class_test   0  1
##                  0 100 62
##                  1 892  1
# Get the confusion matrix and statistics
# y_test_pred_LP <- ifelse(y_test_pred_LP > Best_threshold_train_LP[1], 1, 0)
cm_ann_LP <- confusionMatrix(factor(y_test_pred_LP), factor(c(FGL4_AGNPSR_test_labels)), positive = "1")
print(cm_ann_LP)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 891 1
## 1 101 62
##
## Accuracy : 0.9033
## 95% CI : (0.8839, 0.9205)
## No Information Rate : 0.9403
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.5061
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.98413
## Specificity : 0.89819
## Pos Pred Value : 0.38037
## Neg Pred Value : 0.99888
## Prevalence : 0.05972
## Detection Rate : 0.05877
## Detection Prevalence : 0.15450
## Balanced Accuracy : 0.94116
##
## 'Positive' Class : 1
##
# Add ANN Prediction probabilities to FGL4_results
FGL4_tidy_num_LP <- select(FGL4_tidy_num, -starts_with("PL_"), -starts_with("PLEC_"), -pulsarness)
FGL4_mat <- as.matrix(FGL4_tidy_num_LP)
FGL4_results$ANN_Pred_LP <- model %>% predict_classes(FGL4_mat)
FGL4_results$ANN_P_LP <- model %>% predict_proba(FGL4_mat)
save(FGL4_results, file = "FGL4_results.rData", compress = "xz")
Step 8. AGN vs PSR classification using Gaussian Mixture Model
library(ClusterR)
load("fermicatsR/FGL4_tidy.rData")
library(caret)
library(dplyr)
pulsarness_labels <- select(FGL4_AGNPSR, pulsarness)
#convert labels to pulsar = 1, non-pulsar = 0
pulsarness_labels$pulsarness <- as.numeric(pulsarness_labels$pulsarness=="Pulsar")
##### ##### ##### ##### ##### ##### ##### ##### #####
# select the group of variables using the PLEC spectrum type
FGL4_AGNPSR_PLEC <- select(FGL4_AGNPSR, -starts_with("PL_"), - starts_with("LP_"), -pulsarness)
FGL4_AGNPSR_PLEC_mat <- as.matrix(FGL4_AGNPSR_PLEC)
# 2 mixture components, Euclidean distance, "random_subset" seeding, 10 k-means and 10 EM iterations
gmm_PLEC <- GMM(FGL4_AGNPSR_PLEC_mat, 2, "eucl_dist", "random_subset", 10, 10, verbose = TRUE)
## gmm_diag::learn(): generating initial means
## gmm_diag::learn(): k-means: n_threads: 1
## gmm_diag::learn(): k-means: iteration: 1 delta: 2.18713e+06
## gmm_diag::learn(): k-means: iteration: 2 delta: 389794
## gmm_diag::learn(): k-means: iteration: 3 delta: 198033
## gmm_diag::learn(): k-means: iteration: 4 delta: 125302
## gmm_diag::learn(): k-means: iteration: 5 delta: 84560.1
## gmm_diag::learn(): k-means: iteration: 6 delta: 64403.8
## gmm_diag::learn(): k-means: iteration: 7 delta: 48697.3
## gmm_diag::learn(): k-means: iteration: 8 delta: 41072.8
## gmm_diag::learn(): k-means: iteration: 9 delta: 33630.5
## gmm_diag::learn(): k-means: iteration: 10 delta: 27406.7
## gmm_diag::learn(): generating initial covariances
## gmm_diag::learn(): EM: n_threads: 1
## gmm_diag::learn(): EM: iteration: 1 avg_log_p: -11.64
## gmm_diag::learn(): EM: iteration: 2 avg_log_p: -11.5138
## gmm_diag::learn(): EM: iteration: 3 avg_log_p: -11.4754
## gmm_diag::learn(): EM: iteration: 4 avg_log_p: -11.4581
## gmm_diag::learn(): EM: iteration: 5 avg_log_p: -11.4499
## gmm_diag::learn(): EM: iteration: 6 avg_log_p: -11.4447
## gmm_diag::learn(): EM: iteration: 7 avg_log_p: -11.441
## gmm_diag::learn(): EM: iteration: 8 avg_log_p: -11.4389
## gmm_diag::learn(): EM: iteration: 9 avg_log_p: -11.4374
## gmm_diag::learn(): EM: iteration: 10 avg_log_p: -11.4365
##
## time to complete : 0.00670757
pr_PLEC <- predict_GMM(FGL4_AGNPSR_PLEC_mat, gmm_PLEC$centroids, gmm_PLEC$covariance_matrices, gmm_PLEC$weights)
# GMM cluster labels are arbitrary; flip them so cluster 1 lines up with pulsars
pr_PLEC$cluster_labels <- 1 - pr_PLEC$cluster_labels
table(pr_PLEC$cluster_labels, pulsarness_labels$pulsarness)
##
##        0    1
##   0 2925  135
##   1  343  113
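The flip above was chosen by inspecting the cross-tabulation. A more generic alternative (a sketch, not part of the original pipeline; align_clusters is a hypothetical helper) is to map each cluster to the true class it mostly contains:
# Sketch: map each GMM cluster label to the majority true class in that cluster.
# Caveat: under heavy class imbalance (pulsars are ~7% here) majority voting can
# collapse both clusters onto the majority class, which is why the script instead
# flips labels after inspecting the table for pulsar enrichment.
align_clusters <- function(cluster_labels, labels_true) {
  tab <- table(cluster_labels, labels_true)
  best_col <- apply(tab, 1, which.max)           # majority class per cluster (row)
  mapping <- as.numeric(colnames(tab))[best_col]
  names(mapping) <- rownames(tab)
  unname(mapping[as.character(cluster_labels)])
}
# e.g. aligned <- align_clusters(pr_PLEC$cluster_labels, pulsarness_labels$pulsarness)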
# Get the confusion matrix and statistics
Best_threshold_train <- ROC_threshold_plots_tables(pulsarness_labels$pulsarness, # FGL4_AGNPSR_train_labels,
                                                   pr_PLEC$cluster_proba[, 2],
                                                   pulsarness_labels$pulsarness,
                                                   pr_PLEC$cluster_proba[, 1], pos = 1, neg = 0)
##                    real_category
## Predict_class_train    0   1
##                   0   938 200
##                   1  2330  48
##                   real_category
## Predict_class_test    0   1
##                  0 3162 241
##                  1  106   7
cm_gmm_PLEC <- confusionMatrix(factor(pr_PLEC$cluster_labels), factor(c(pulsarness_labels$pulsarness)), positive = "1")
print(cm_gmm_PLEC)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 2925 135
## 1 343 113
##
## Accuracy : 0.8641
## 95% CI : (0.8523, 0.8752)
## No Information Rate : 0.9295
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.2527
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.45565
## Specificity : 0.89504
## Pos Pred Value : 0.24781
## Neg Pred Value : 0.95588
## Prevalence : 0.07053
## Detection Rate : 0.03214
## Detection Prevalence : 0.12969
## Balanced Accuracy : 0.67534
##
## 'Positive' Class : 1
##
##### ##### ##### ##### ##### ##### ##### ##### #####
# select the group of variables using the LP spectrum type
FGL4_AGNPSR_LP <- select(FGL4_AGNPSR, -starts_with("PLEC_"), -starts_with("PL_"), -pulsarness)
FGL4_AGNPSR_LP_mat <- as.matrix(FGL4_AGNPSR_LP)
# same GMM settings as for the PLEC feature set
gmm_LP <- GMM(FGL4_AGNPSR_LP_mat, 2, "eucl_dist", "random_subset", 10, 10, verbose = TRUE)
## gmm_diag::learn(): generating initial means
## gmm_diag::learn(): k-means: n_threads: 1
## gmm_diag::learn(): k-means: iteration: 1 delta: 2.18713e+06
## gmm_diag::learn(): k-means: iteration: 2 delta: 389794
## gmm_diag::learn(): k-means: iteration: 3 delta: 198033
## gmm_diag::learn(): k-means: iteration: 4 delta: 125302
## gmm_diag::learn(): k-means: iteration: 5 delta: 84560.1
## gmm_diag::learn(): k-means: iteration: 6 delta: 64403.8
## gmm_diag::learn(): k-means: iteration: 7 delta: 48697.3
## gmm_diag::learn(): k-means: iteration: 8 delta: 41072.8
## gmm_diag::learn(): k-means: iteration: 9 delta: 33630.5
## gmm_diag::learn(): k-means: iteration: 10 delta: 27406.7
## gmm_diag::learn(): generating initial covariances
## gmm_diag::learn(): EM: n_threads: 1
## gmm_diag::learn(): EM: iteration: 1 avg_log_p: -24.4655
## gmm_diag::learn(): EM: iteration: 2 avg_log_p: -24.0082
## gmm_diag::learn(): EM: iteration: 3 avg_log_p: -23.7745
## gmm_diag::learn(): EM: iteration: 4 avg_log_p: -23.642
## gmm_diag::learn(): EM: iteration: 5 avg_log_p: -23.5801
## gmm_diag::learn(): EM: iteration: 6 avg_log_p: -23.5476
## gmm_diag::learn(): EM: iteration: 7 avg_log_p: -23.5284
## gmm_diag::learn(): EM: iteration: 8 avg_log_p: -23.5172
## gmm_diag::learn(): EM: iteration: 9 avg_log_p: -23.5103
## gmm_diag::learn(): EM: iteration: 10 avg_log_p: -23.5058
##
## time to complete : 0.00570633
pr_LP <- predict_GMM(FGL4_AGNPSR_LP_mat, gmm_LP$centroids, gmm_LP$covariance_matrices, gmm_LP$weights)
# flip the arbitrary cluster labels so cluster 1 lines up with pulsars
pr_LP$cluster_labels <- 1 - pr_LP$cluster_labels
table(pr_LP$cluster_labels, pulsarness_labels$pulsarness)
##
##        0    1
##   0 2408   67
##   1  860  181
# Get the confusion matrix and statistics
Best_threshold_train <- ROC_threshold_plots_tables(pulsarness_labels$pulsarness, # FGL4_AGNPSR_train_labels,
                                                   pr_LP$cluster_proba[, 2],
                                                   pulsarness_labels$pulsarness,
                                                   pr_LP$cluster_proba[, 1], pos = 1, neg = 0)
##                    real_category
## Predict_class_train    0   1
##                   0  1154 206
##                   1  2114  42
##                   real_category
## Predict_class_test    0   1
##                  0 2628 173
##                  1  640  75
cm_gmm_LP <- confusionMatrix(factor(pr_LP$cluster_labels), factor(c(pulsarness_labels$pulsarness)), positive = "1")
print(cm_gmm_LP)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 2408 67
## 1 860 181
##
## Accuracy : 0.7363
## 95% CI : (0.7214, 0.7509)
## No Information Rate : 0.9295
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.1884
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.72984
## Specificity : 0.73684
## Pos Pred Value : 0.17387
## Neg Pred Value : 0.97293
## Prevalence : 0.07053
## Detection Rate : 0.05148
## Detection Prevalence : 0.29608
## Balanced Accuracy : 0.73334
##
## 'Positive' Class : 1
##
# Save current workspace for subsequent steps
save.image()