# 1) Installing necessary libraries if(!require(tidyverse)) install.packages("tidyverse") # Install tidyverse if not present if(!require(ggplot2)) install.packages("ggplot2") # Install ggplot2 if not present if(!require(caret)) install.packages("caret") # Install caret package if not present if(!require(pROC)) install.packages("pROC") # Install pROC for ROC curve and AUC calculation library(tidyverse) # Load tidyverse package for data manipulation library(ggplot2) # Load ggplot2 for plotting library(caret) # Load caret for modeling utilities library(pROC) # Load pROC for ROC curve analysis # 2) Preparing data based on true randomly drawn coefficients set.seed(123) # Set seed for reproducibility n <- 5000 # Set number of samples to generate gender <- sample(c("Male", "Female"), n, replace = TRUE) # Randomly assign gender area <- sample(c("Urban", "Rural"), n, replace = TRUE) # Randomly assign area age <- round(runif(n, 18, 80)) # Generate random ages between 18 and 80 horse_power <- rnorm(n, mean = 150, sd = 40) # Generate horse power values from a normal distribution horse_power <- ifelse(horse_power < 60, 60, horse_power) # Set a lower limit of 60 for horsepower # Generate hidden true coefficients true_intercept <- -4.5 # Set true intercept true_gender_coef <- runif(1, 0.01, 0.03) # Random coefficient for gender effect true_area_coef <- runif(1, 0.015, 0.03) # Random coefficient for area effect true_age_coef <- runif(1, -0.008, -0.002) # Random coefficient for age effect true_age_over_65_coef <- runif(1, 0.03, 0.06) # Increased risk over 65 true_hp_coef <- runif(1, 0.001, 0.003) # Random coefficient for horse power effect # Calculate true probability linear_pred <- true_intercept + true_gender_coef * (gender == "Male") + # Add contribution from gender true_area_coef * (area == "Urban") + # Add contribution from area true_age_coef * age + # Add contribution from age true_hp_coef * horse_power # Add contribution from horse power prob_accident <- exp(linear_pred) / (1 + exp(linear_pred)) # Convert linear predictor to probability had_accident <- rbinom(n, size = 1, prob = prob_accident) # Generate accident outcomes based on probability data <- data.frame(gender, area, age, horse_power, had_accident) # Combine into a dataframe summary(data) # 3) Model itself with comments explaining its steps model <- glm(had_accident ~ gender + area + age + horse_power, data = data, family = "binomial") # Fitting the model summary(model) # Display summary of model results predicted_risk <- predict(model, type = "response") # Predict accident probabilities for each observation data$predicted_risk <- predicted_risk # Add predictions to the dataframe # 4) Plotting risk by variables ggplot(data, aes(x = gender, y = predicted_risk)) + # Plot risk by gender geom_boxplot() + ggtitle("Predicted Accident Risk by Gender") ggplot(data, aes(x = area, y = predicted_risk)) + # Plot risk by area geom_boxplot() + ggtitle("Predicted Accident Risk by Area") ggplot(data, aes(x = age, y = predicted_risk)) + # Scatter plot of risk by age geom_point(alpha = 0.2) + geom_smooth() + ggtitle("Predicted Accident Risk by Age") ggplot(data, aes(x = horse_power, y = predicted_risk)) + # Scatter plot of risk by horse power geom_point(alpha = 0.2) + geom_smooth() + ggtitle("Predicted Accident Risk by Horse Power") # 5) Checking model's predictive power roc_obj <- roc(data$had_accident, data$predicted_risk) # Create ROC object aic_value <- AIC(model) # Calculate AIC for model quality assessment auc_value <- auc(roc_obj) # Calculate the Area Under the Curve (AUC) print(paste("Model AIC:", aic_value)) # Print AIC value print(paste("Model AUC:", auc_value)) # Print AUC value ###### #AUC = Area Under the ROC Curve #It measures how well the model discriminates between two classes (in your case: accident vs. no accident). #The value ranges from 0.5 to 1: #0.5 = no better than random guessing #0.6–0.7 = weak discrimination #0.7–0.8 = acceptable #0.8–0.9 = excellent #0.9–1.0 = outstanding ###### # Plot the ROC curve plot(roc_obj, main = "ROC Curve for Accident Risk Model") # Adding new policy holder new_driver <- data.frame( gender = "Female", area = "Rural", age = 18, horse_power = 260 ) # Predict probability of accident for the new record predicted_probability <- predict(model, newdata = new_driver, type = "response") # Print result cat("The predicted probability of an accident for this new driver is:", round(predicted_probability, 4), "\n") # Asssuming you've already added predicted_risk column: expected_accidents <- sum(data$predicted_risk) cat("The expected number of accidents out of", nrow(data), "observations is approximately:", round(expected_accidents), "\n")