Classification on 4FGL (Part2)
Via Artificial Neural Network and Gaussian Mixture Model
SH
Note that the R package keras can conflict with other R packages; it is best to detach any packages you don't need before running this script (a sketch follows).
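A minimal sketch for that cleanup (assumes an interactive session; adjust to your setup):
# Sketch: detach all currently attached non-base packages before loading keras
attached <- names(sessionInfo()$otherPkgs)
for (pkg in attached) {
  detach(paste0("package:", pkg), character.only = TRUE, unload = TRUE)
}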
Other links
This is Script 3.
Script 2: Classification on 4FGL (Part 1). See here.
Script 1: Comparing FGL3 unclassified predictions and FGL4 discoveries. See here.
Script 0: Data pre-processing. "FGL4_tidy_for_FGL3.rData" contains the processed 4FGL catalog data. See the processing (and preliminary feature selection via correlation) steps here.
Step 2 (as in Part 1). Function to calculate AUC as an evaluation metric, find the best threshold, and plot ROC curves
# Load workspace from previous step
library(keras)
# install_keras() # run once to set up the Python backend
library(pROC)
load("FGL4_results.rData")
load(".RData")
kerasAUC <- function(y_true, y_pred) {
  true <- k_flatten(y_true)
  pred <- k_flatten(y_pred)
  # total number of elements in this batch
  totalCount <- k_shape(true)[1]
  # sort the prediction values in descending order
  result <- tensorflow::tf$nn$top_k(pred, k = totalCount)
  values <- result$values   # access by name to avoid 0- vs 1-based indexing bugs
  indices <- result$indices
  # sort the ground-truth values by the predictions above
  sortedTrue <- k_gather(true, indices)
  # the ground-truth negatives (already sorted above)
  negatives <- 1 - sortedTrue
  # the true-positive count per threshold
  TPCurve <- k_cumsum(sortedTrue)
  # area under the curve: counts the correctly ordered (positive, negative) pairs
  auc <- k_sum(TPCurve * negatives)
  # normalize by the total number of (positive, negative) pairs
  totalCount <- k_cast(totalCount, k_floatx())
  positiveCount <- k_sum(true)
  negativeCount <- totalCount - positiveCount
  totalArea <- positiveCount * negativeCount
  return(auc / totalArea)
}
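The counting trick above is the Mann-Whitney formulation of AUC: the number of (positive, negative) pairs in which the positive outranks the negative, divided by the number of all such pairs. A quick plain-R check of the same identity on made-up data (pROC is loaded above):
# Plain-R check of the pairwise-counting identity used in kerasAUC
truth <- c(1, 0, 1, 0, 0, 1)
score <- c(0.9, 0.8, 0.7, 0.4, 0.3, 0.2)
sortedTrue <- truth[order(score, decreasing = TRUE)]
sum(cumsum(sortedTrue) * (1 - sortedTrue)) / (sum(truth) * sum(1 - truth))
# 5/9 = 0.5556, which matches:
as.numeric(auc(roc(truth, score)))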
ROC_threshold_plots_tables <- function(truth_train, prediction_train,
                                       truth_test, prediction_test,
                                       threshold = 0, pos = "Pulsar", neg = "Non-Pulsar",
                                       print_table = TRUE) {
  if (threshold == 0) {
    # Compute the best threshold if none is provided
    # (the row maximizing specificity + sensitivity, i.e. Youden's J)
    ROC <- roc(truth_train, prediction_train, levels = c(pos, neg))
    ROC_table <- cbind(ROC$thresholds, ROC$specificities, ROC$sensitivities)
    best_threshold <- ROC_table[which.max(ROC_table[, 2] + ROC_table[, 3]), ]
  } else {
    # Use the provided threshold as-is
    best_threshold <- threshold
  }
  if (nargs() > 2) {
    # Make ROC plots for train and test side by side
    par(mfrow = c(1, 2))
    ROC_train <- roc(truth_train, prediction_train, levels = c(pos, neg),
                     plot = TRUE, print.auc = TRUE, main = "ROC Train", print.thres = "best")
    ROC_test <- roc(truth_test, prediction_test, levels = c(pos, neg),
                    plot = TRUE, print.auc = TRUE, main = "ROC Test", print.thres = "best")
    if (print_table) {
      # Generate contingency tables
      # Train data
      Predict_class_train <- ifelse(prediction_train > best_threshold[1], pos, neg)
      real_category <- truth_train
      print(table(Predict_class_train, real_category))
      # Test data
      Predict_class_test <- ifelse(prediction_test > best_threshold[1], pos, neg)
      real_category <- truth_test
      print(table(Predict_class_test, real_category))
    }
  }
  return(best_threshold)
}
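For reference, a hypothetical minimal call on toy data (toy_truth/toy_score are made up for illustration; with 0/1 labels, pass pos = 1 and neg = 0 as below):
# Hypothetical usage: 0/1 labels and predicted probabilities of class 1
toy_truth <- c(1, 1, 0, 0, 1, 0, 0, 0)
toy_score <- c(0.91, 0.72, 0.55, 0.40, 0.38, 0.22, 0.18, 0.05)
thr <- ROC_threshold_plots_tables(toy_truth, toy_score,
                                  toy_truth, toy_score,
                                  pos = 1, neg = 0)
thr # best threshold with its specificity and sensitivity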
# Save current workspace for subsequent steps
# save.image()
Step 7. AGN vs PSR classification using ANN
# load(".RData")
# library(keras)
y_train <- to_categorical(smote_AGNPSR_train_labels, 2) # one-hot: column 1 = AGN (label 0), column 2 = pulsar (label 1)
y_test <- to_categorical(FGL4_AGNPSR_test_labels, 2)
x_train <- as.data.frame(smote_PLEC_train[, -ncol(smote_PLEC_train)]) # drop the last column (the class label)
x_train <- as.matrix(x_train)
x_test <- as.matrix(FGL4_AGNPSR_PLEC_test)
#head(smote_PLEC_train)
#table(smote_PLEC_train$pulsarness)
# create metric using backend tensor functions
metric_kerasAUC <- custom_metric("kerasAUC", function(y_true, y_pred) {
kerasAUC(y_true, y_pred)
})
model <- keras_model_sequential()
model %>%
  layer_dense(units = 14, # positive integer, dimensionality of the output space
              activation = 'relu', bias_regularizer = regularizer_l2(0.005),
              input_shape = c(ncol(smote_PLEC_train) - 1)) %>%
  layer_dropout(rate = 0.3) %>%
  layer_dense(units = 6, activation = 'relu', bias_regularizer = regularizer_l2(0.01)) %>%
  layer_dropout(rate = 0.2) %>%
  layer_dense(units = 2, activation = 'softmax')
summary(model)
model %>% compile(
  loss = 'categorical_crossentropy',
  optimizer = optimizer_rmsprop(lr = 0.01),
  metrics = c(metric_kerasAUC, 'accuracy')
)
history <- model %>% fit(
  x_train, y_train,
  epochs = 100, # batch_size = 128,
  validation_split = 0.2
)
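To inspect the run, the returned history object can be plotted directly; this shows loss, accuracy, and the custom kerasAUC metric per epoch for both the training and validation splits.
# Visualize per-epoch metrics for the training and validation splits
plot(history)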
model %>% save_model_hdf5("my_model_best.h5")
y_train_pred <- model %>% predict_classes(x_train)
y_test_pred <- model %>% predict_classes(x_test)
y_train_p <- model %>% predict_proba(x_train)
y_test_p <- model %>% predict_proba(x_test)
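Side note: predict_classes() and predict_proba() have since been removed from newer releases of the keras R interface (around keras/TensorFlow 2.6, if memory serves; treat the version as an assumption). An equivalent sketch under the newer API:
# Sketch for newer keras versions without predict_classes()/predict_proba():
# both can be derived from predict() on a softmax output
prob <- model %>% predict(x_test)  # per-class probabilities, as predict_proba() returned
cls  <- max.col(prob) - 1          # 0-based class labels, as predict_classes() returned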
# score <- model %>% evaluate(x_test, y_test)
# factor(y_test_pred)
Best_threshold_train_PLEC <- ROC_threshold_plots_tables(c(smote_AGNPSR_train_labels), # FGL4_AGNPSR_train_labels,
                                                        y_train_p[, 1],
                                                        c(FGL4_AGNPSR_test_labels),
                                                        y_test_p[, 1], pos = 1, neg = 0)
##                    real_category
## Predict_class_train    0    1
##                   0   170 2333
##                   1  2050   72
##                   real_category
## Predict_class_test   0  1
##                  0  58 58
##                  1 934  5
# Get the confusion matrix and statistics
library(caret)
#y_test_pred <- ifelse (y_test_pred > Best_threshold_train_PLEC[1],1,0)
cm_ann_PLEC <- confusionMatrix(factor(y_test_pred), factor(c(FGL4_AGNPSR_test_labels)), positive = "1")
print(cm_ann_PLEC)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 938 5
## 1 54 58
##
## Accuracy : 0.9441
## 95% CI : (0.9285, 0.9572)
## No Information Rate : 0.9403
## P-Value [Acc > NIR] : 0.3302
##
## Kappa : 0.635
##
## Mcnemar's Test P-Value : 4.129e-10
##
## Sensitivity : 0.92063
## Specificity : 0.94556
## Pos Pred Value : 0.51786
## Neg Pred Value : 0.99470
## Prevalence : 0.05972
## Detection Rate : 0.05498
## Detection Prevalence : 0.10616
## Balanced Accuracy : 0.93310
##
## 'Positive' Class : 1
##
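As a sanity check, the headline statistics follow directly from the test confusion matrix above:
# Recompute the reported statistics from the confusion matrix entries
TP <- 58; FN <- 5; FP <- 54; TN <- 938
TP / (TP + FN)                    # Sensitivity: 0.92063
TN / (TN + FP)                    # Specificity: 0.94556
TP / (TP + FP)                    # Pos Pred Value: 0.51786
(TP + TN) / (TP + TN + FP + FN)   # Accuracy: 0.9441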
# Add ANN Prediction probabilities to FGL4_results
FGL4_tidy_num <- select(FGL4_tidy, -CLASS1, -ASSOC1, -ASSOC2, -CLASS2,
                        -SpectrumType, -Source_Name, -ASSOC_FGL, -GLAT, -Conf_95_PosAng, -agnness)
FGL4_tidy_num_PLEC <- select(FGL4_tidy_num, -starts_with("PL_"), -starts_with("LP_"), -pulsarness)
FGL4_mat <- as.matrix(FGL4_tidy_num_PLEC)
FGL4_results$ANN_Pred_PLEC <- model %>% predict_classes(FGL4_mat)
FGL4_results$ANN_P_PLEC <- model %>% predict_proba(FGL4_mat)
##### ##### ##### ##### ##### ##### ##### ##### #####
# select the group of variables using the LP spectrum type
x_train_LP <- as.data.frame(smote_LP_train[, -ncol(smote_LP_train)]) # drop the last column (the class label)
x_train_LP <- as.matrix(x_train_LP)
x_test_LP <- as.matrix(FGL4_AGNPSR_LP_test)
model <- keras_model_sequential()
model %>%
  layer_dense(units = 12, # positive integer, dimensionality of the output space
              activation = 'relu', bias_regularizer = regularizer_l2(0.005),
              input_shape = c(ncol(smote_LP_train) - 1)) %>%
  layer_dropout(rate = 0.3) %>%
  layer_dense(units = 6, activation = 'relu', bias_regularizer = regularizer_l2(0.01)) %>%
  layer_dropout(rate = 0.2) %>%
  layer_dense(units = 2, activation = 'softmax')
summary(model)
model %>% compile(
  loss = 'categorical_crossentropy',
  optimizer = optimizer_rmsprop(lr = 0.01),
  metrics = c(metric_kerasAUC, 'accuracy')
)
history <- model %>% fit(
  x_train_LP, y_train, # fit on the LP feature matrix, not the PLEC x_train
  epochs = 100, # batch_size = 128,
  validation_split = 0.2
)
model %>% save_model_hdf5("my_model_best_LP.h5")
y_train_pred_LP <- model %>% predict_classes(x_train_LP)
y_test_pred_LP <- model %>% predict_classes(x_test_LP)
y_train_p_LP <- model %>% predict_proba(x_train_LP)
y_test_p_LP <- model %>% predict_proba(x_test_LP)
#score <- model %>% evaluate(x_test, y_test)
# factor(y_test_pred)
Best_threshold_train_LP <- ROC_threshold_plots_tables(c(smote_AGNPSR_train_labels), # FGL4_AGNPSR_train_labels,
                                                      y_train_p_LP[, 1],
                                                      c(FGL4_AGNPSR_test_labels),
                                                      y_test_p_LP[, 1], pos = 1, neg = 0)
##                    real_category
## Predict_class_train    0    1
##                   0   234 2343
##                   1  1986   62
##                   real_category
## Predict_class_test   0  1
##                  0 100 62
##                  1 892  1
# Get the confusion matrix and statistics
# y_test_pred_LP <- ifelse(y_test_pred_LP > Best_threshold_train_LP[1], 1, 0)
cm_ann_LP <- confusionMatrix(factor(y_test_pred_LP), factor(c(FGL4_AGNPSR_test_labels)), positive = "1")
print(cm_ann_LP)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 891 1
## 1 101 62
##
## Accuracy : 0.9033
## 95% CI : (0.8839, 0.9205)
## No Information Rate : 0.9403
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.5061
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.98413
## Specificity : 0.89819
## Pos Pred Value : 0.38037
## Neg Pred Value : 0.99888
## Prevalence : 0.05972
## Detection Rate : 0.05877
## Detection Prevalence : 0.15450
## Balanced Accuracy : 0.94116
##
## 'Positive' Class : 1
##
# Add ANN Prediction probabilities to FGL4_results
FGL4_tidy_num_LP <- select(FGL4_tidy_num, -starts_with("PL_"), -starts_with("PLEC_"), -pulsarness)
FGL4_mat <- as.matrix(FGL4_tidy_num_LP)
FGL4_results$ANN_Pred_LP <- model %>% predict_classes(FGL4_mat)
FGL4_results$ANN_P_LP <- model %>% predict_proba(FGL4_mat)
save(FGL4_results, file = "FGL4_results.rData", compress = "xz")
Step 8. AGN vs PSR classification using Gaussian Mixture Model
library(ClusterR)
load("fermicatsR/FGL4_tidy.rData")
library(caret)
library(dplyr)
pulsarness_labels <- select(FGL4_AGNPSR, pulsarness)
#convert labels to pulsar = 1, non-pulsar = 0
pulsarness_labels$pulsarness <- as.numeric(pulsarness_labels$pulsarness=="Pulsar")
##### ##### ##### ##### ##### ##### ##### ##### #####
# select the group of variables using the PLEC spectrum type
FGL4_AGNPSR_PLEC <- select(FGL4_AGNPSR, -starts_with("PL_"), - starts_with("LP_"), -pulsarness)
FGL4_AGNPSR_PLEC_mat <- as.matrix(FGL4_AGNPSR_PLEC)
# 2 mixture components, Euclidean distance, "random_subset" seeding, 10 k-means and 10 EM iterations
gmm_PLEC <- GMM(FGL4_AGNPSR_PLEC_mat, 2, "eucl_dist", "random_subset", 10, 10, verbose = TRUE)
## gmm_diag::learn(): generating initial means
## gmm_diag::learn(): k-means: n_threads: 1
## gmm_diag::learn(): k-means: iteration: 1 delta: 2.18713e+06
## gmm_diag::learn(): k-means: iteration: 2 delta: 389794
## gmm_diag::learn(): k-means: iteration: 3 delta: 198033
## gmm_diag::learn(): k-means: iteration: 4 delta: 125302
## gmm_diag::learn(): k-means: iteration: 5 delta: 84560.1
## gmm_diag::learn(): k-means: iteration: 6 delta: 64403.8
## gmm_diag::learn(): k-means: iteration: 7 delta: 48697.3
## gmm_diag::learn(): k-means: iteration: 8 delta: 41072.8
## gmm_diag::learn(): k-means: iteration: 9 delta: 33630.5
## gmm_diag::learn(): k-means: iteration: 10 delta: 27406.7
## gmm_diag::learn(): generating initial covariances
## gmm_diag::learn(): EM: n_threads: 1
## gmm_diag::learn(): EM: iteration: 1 avg_log_p: -11.64
## gmm_diag::learn(): EM: iteration: 2 avg_log_p: -11.5138
## gmm_diag::learn(): EM: iteration: 3 avg_log_p: -11.4754
## gmm_diag::learn(): EM: iteration: 4 avg_log_p: -11.4581
## gmm_diag::learn(): EM: iteration: 5 avg_log_p: -11.4499
## gmm_diag::learn(): EM: iteration: 6 avg_log_p: -11.4447
## gmm_diag::learn(): EM: iteration: 7 avg_log_p: -11.441
## gmm_diag::learn(): EM: iteration: 8 avg_log_p: -11.4389
## gmm_diag::learn(): EM: iteration: 9 avg_log_p: -11.4374
## gmm_diag::learn(): EM: iteration: 10 avg_log_p: -11.4365
##
## time to complete : 0.00670757
pr_PLEC <- predict_GMM(FGL4_AGNPSR_PLEC_mat, gmm_PLEC$centroids, gmm_PLEC$covariance_matrices, gmm_PLEC$weights)
# GMM cluster labels are arbitrary; flip them so cluster 1 lines up with pulsars
pr_PLEC$cluster_labels <- 1 - pr_PLEC$cluster_labels
table(pr_PLEC$cluster_labels, pulsarness_labels$pulsarness)
##
##        0    1
##   0 2925  135
##   1  343  113
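The flip above was chosen by inspecting the cross-tabulation. A more generic alternative (a sketch, not part of the original pipeline; align_clusters is a hypothetical helper) is to map each cluster to the true class it mostly contains:
# Sketch: map each GMM cluster label to the majority true class in that cluster.
# Caveat: under heavy class imbalance (pulsars are ~7% here) majority voting can
# collapse both clusters onto the majority class, which is why the script instead
# flips labels after inspecting the table for pulsar enrichment.
align_clusters <- function(cluster_labels, labels_true) {
  tab <- table(cluster_labels, labels_true)
  best_col <- apply(tab, 1, which.max)           # majority class per cluster (row)
  mapping <- as.numeric(colnames(tab))[best_col]
  names(mapping) <- rownames(tab)
  unname(mapping[as.character(cluster_labels)])
}
# e.g. aligned <- align_clusters(pr_PLEC$cluster_labels, pulsarness_labels$pulsarness)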
# Get the confusion matrix and statistics
Best_threshold_train <- ROC_threshold_plots_tables(pulsarness_labels$pulsarness, # FGL4_AGNPSR_train_labels,
                                                   pr_PLEC$cluster_proba[, 2],
                                                   pulsarness_labels$pulsarness,
                                                   pr_PLEC$cluster_proba[, 1], pos = 1, neg = 0)
##                    real_category
## Predict_class_train    0   1
##                   0   938 200
##                   1  2330  48
##                   real_category
## Predict_class_test    0   1
##                  0 3162 241
##                  1  106   7
cm_gmm_PLEC <- confusionMatrix(factor(pr_PLEC$cluster_labels), factor(c(pulsarness_labels$pulsarness)), positive = "1")
print(cm_gmm_PLEC)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 2925 135
## 1 343 113
##
## Accuracy : 0.8641
## 95% CI : (0.8523, 0.8752)
## No Information Rate : 0.9295
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.2527
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.45565
## Specificity : 0.89504
## Pos Pred Value : 0.24781
## Neg Pred Value : 0.95588
## Prevalence : 0.07053
## Detection Rate : 0.03214
## Detection Prevalence : 0.12969
## Balanced Accuracy : 0.67534
##
## 'Positive' Class : 1
##
##### ##### ##### ##### ##### ##### ##### ##### #####
# select the group of variables using the LP spectrum type
FGL4_AGNPSR_LP <- select(FGL4_AGNPSR, -starts_with("PLEC_"), -starts_with("PL_"), -pulsarness)
FGL4_AGNPSR_LP_mat <- as.matrix(FGL4_AGNPSR_LP)
# same GMM settings as for the PLEC feature set
gmm_LP <- GMM(FGL4_AGNPSR_LP_mat, 2, "eucl_dist", "random_subset", 10, 10, verbose = TRUE)
## gmm_diag::learn(): generating initial means
## gmm_diag::learn(): k-means: n_threads: 1
## gmm_diag::learn(): k-means: iteration: 1 delta: 2.18713e+06
## gmm_diag::learn(): k-means: iteration: 2 delta: 389794
## gmm_diag::learn(): k-means: iteration: 3 delta: 198033
## gmm_diag::learn(): k-means: iteration: 4 delta: 125302
## gmm_diag::learn(): k-means: iteration: 5 delta: 84560.1
## gmm_diag::learn(): k-means: iteration: 6 delta: 64403.8
## gmm_diag::learn(): k-means: iteration: 7 delta: 48697.3
## gmm_diag::learn(): k-means: iteration: 8 delta: 41072.8
## gmm_diag::learn(): k-means: iteration: 9 delta: 33630.5
## gmm_diag::learn(): k-means: iteration: 10 delta: 27406.7
## gmm_diag::learn(): generating initial covariances
## gmm_diag::learn(): EM: n_threads: 1
## gmm_diag::learn(): EM: iteration: 1 avg_log_p: -24.4655
## gmm_diag::learn(): EM: iteration: 2 avg_log_p: -24.0082
## gmm_diag::learn(): EM: iteration: 3 avg_log_p: -23.7745
## gmm_diag::learn(): EM: iteration: 4 avg_log_p: -23.642
## gmm_diag::learn(): EM: iteration: 5 avg_log_p: -23.5801
## gmm_diag::learn(): EM: iteration: 6 avg_log_p: -23.5476
## gmm_diag::learn(): EM: iteration: 7 avg_log_p: -23.5284
## gmm_diag::learn(): EM: iteration: 8 avg_log_p: -23.5172
## gmm_diag::learn(): EM: iteration: 9 avg_log_p: -23.5103
## gmm_diag::learn(): EM: iteration: 10 avg_log_p: -23.5058
##
## time to complete : 0.00570633
pr_LP <- predict_GMM(FGL4_AGNPSR_LP_mat, gmm_LP$centroids, gmm_LP$covariance_matrices, gmm_LP$weights)
# flip the arbitrary cluster labels so cluster 1 lines up with pulsars
pr_LP$cluster_labels <- 1 - pr_LP$cluster_labels
table(pr_LP$cluster_labels, pulsarness_labels$pulsarness)
##
##        0    1
##   0 2408   67
##   1  860  181
# Get the confusion matrix and statistics
Best_threshold_train <- ROC_threshold_plots_tables(pulsarness_labels$pulsarness, # FGL4_AGNPSR_train_labels,
                                                   pr_LP$cluster_proba[, 2],
                                                   pulsarness_labels$pulsarness,
                                                   pr_LP$cluster_proba[, 1], pos = 1, neg = 0)
##                    real_category
## Predict_class_train    0   1
##                   0  1154 206
##                   1  2114  42
##                   real_category
## Predict_class_test    0   1
##                  0 2628 173
##                  1  640  75
cm_gmm_LP <- confusionMatrix(factor(pr_LP$cluster_labels), factor(c(pulsarness_labels$pulsarness)), positive = "1")
print(cm_gmm_LP)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 2408 67
## 1 860 181
##
## Accuracy : 0.7363
## 95% CI : (0.7214, 0.7509)
## No Information Rate : 0.9295
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.1884
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.72984
## Specificity : 0.73684
## Pos Pred Value : 0.17387
## Neg Pred Value : 0.97293
## Prevalence : 0.07053
## Detection Rate : 0.05148
## Detection Prevalence : 0.29608
## Balanced Accuracy : 0.73334
##
## 'Positive' Class : 1
##
# Save current workspace for subsequent steps
save.image()