# Titanic: starter script for the ggplot2 class.
# Run this file before writing the plots live.
#
# Objects created for the session:
#   train            - raw data as read from data/train.csv
#   titanic          - `train` with labelled factors and derived columns
#                      (Title, FamilySize, FamilyGroup, AgeGroup)
#   survival_palette - named colours keyed by the Survived labels
#   class_palette    - named colours keyed by the Pclass labels
#   missing_data     - per-column NA counts of `train` (columns with NAs only)

library(tidyverse)
library(scales)

theme_set(theme_minimal(base_size = 14))

# Working directory ----
# Inside RStudio, switch to the folder of the script open in the source
# editor so that the relative "data/" path resolves next to it.
if (requireNamespace("rstudioapi", quietly = TRUE) &&
  rstudioapi::isAvailable()) {
  script_path <- rstudioapi::getSourceEditorContext()$path
  # The path can be missing (no editor context -> NULL) or empty (unsaved
  # document). `if (nzchar(NULL))` would stop with "argument is of length
  # zero", so check the shape before testing for a non-empty string.
  has_script_path <- is.character(script_path) &&
    length(script_path) == 1L &&
    !is.na(script_path) &&
    nzchar(script_path)
  if (has_script_path) {
    setwd(dirname(script_path))
  }
}

# Data file ----
dir.create("data", showWarnings = FALSE)
data_path <- "data/train.csv"

if (!file.exists(data_path)) {
  url <- "https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv"
  # Download to a side file and rename only on success. Writing straight to
  # `data_path` would leave a truncated file behind after a failed download,
  # and the `file.exists()` check above would then accept it on later runs.
  partial_path <- paste0(data_path, ".part")
  download_status <- tryCatch(
    download.file(url, partial_path, mode = "wb", quiet = TRUE),
    error = function(e) {
      unlink(partial_path)
      stop(
        "Nie udało się pobrać pliku danych: ", conditionMessage(e),
        call. = FALSE
      )
    }
  )
  # download.file() reports 0 on success and a non-zero code on failure.
  if (!isTRUE(download_status == 0L)) {
    unlink(partial_path)
    stop("Nie udało się pobrać pliku danych z: ", url, call. = FALSE)
  }
  file.rename(partial_path, data_path)
}

train <- read_csv(data_path, show_col_types = FALSE)

message("Katalog pracy: ", getwd())
message("Plik danych: ", normalizePath(data_path))

# Plot-ready data ----
titanic <- train %>%
  mutate(
    # Recode the coded columns as factors with readable (Polish) labels.
    Survived = factor(
      Survived,
      levels = c(0, 1),
      labels = c("Nie przeżył", "Przeżył")
    ),
    Pclass = factor(
      Pclass,
      levels = c(1, 2, 3),
      labels = c("1 klasa", "2 klasa", "3 klasa")
    ),
    Sex = factor(
      Sex,
      levels = c("female", "male"),
      labels = c("kobieta", "mężczyzna")
    ),
    Embarked = factor(
      Embarked,
      levels = c("C", "Q", "S"),
      labels = c("Cherbourg", "Queenstown", "Southampton")
    ),
    # Keep only the text between ", " and the first "." of Name, i.e. the
    # honorific, then fold the variants into five groups.
    Title = gsub("(.*, )|(\\..*)", "", Name),
    Title = case_when(
      Title %in% c("Mlle", "Ms") ~ "Miss",
      Title == "Mme" ~ "Mrs",
      Title %in% c("Mr", "Miss", "Mrs", "Master") ~ Title,
      TRUE ~ "Inny"
    ),
    Title = factor(Title, levels = c("Mr", "Miss", "Mrs", "Master", "Inny")),
    # Family size counts the passenger plus siblings/spouses and
    # parents/children.
    FamilySize = SibSp + Parch + 1,
    FamilyGroup = case_when(
      FamilySize == 1 ~ "samodzielnie",
      FamilySize <= 4 ~ "mała rodzina",
      TRUE ~ "duża rodzina"
    ),
    FamilyGroup = factor(
      FamilyGroup,
      levels = c("samodzielnie", "mała rodzina", "duża rodzina")
    ),
    # Missing ages get their own bucket; it must be tested first because
    # comparisons with NA would otherwise fall through to the last branch.
    AgeGroup = case_when(
      is.na(Age) ~ "brak wieku",
      Age < 12 ~ "dziecko",
      Age < 18 ~ "nastolatek",
      Age < 40 ~ "młody dorosły",
      Age < 60 ~ "dorosły",
      TRUE ~ "senior"
    ),
    AgeGroup = factor(
      AgeGroup,
      levels = c(
        "dziecko", "nastolatek", "młody dorosły",
        "dorosły", "senior", "brak wieku"
      )
    )
  )

# Palettes ----
# The names repeat the factor labels defined above exactly, so the vectors
# can be used as named values in manual colour/fill scales.
survival_palette <- c("Nie przeżył" = "#d94f45", "Przeżył" = "#2f6f8f")
class_palette <- c(
  "1 klasa" = "#2f6f8f",
  "2 klasa" = "#f2a541",
  "3 klasa" = "#7a869a"
)

# Missing values ----
# Number of NAs per column of the raw data, largest first; columns without
# any NA are dropped.
missing_data <- tibble(
  Variable = names(train),
  Missing = colSums(is.na(train))
) %>%
  filter(Missing > 0) %>%
  arrange(desc(Missing))

glimpse(titanic)
missing_data

# From this point on we write the plots: