library(vroom)
library(tidyverse)
library(sf)Datenbereinigung
Libraries
Importing and cleaning data
ATTENTION: If this project was cloned from GitHub, the INSPIRE_Grid folder must be downloaded from the source below and placed into the right directory. The files were too large for GitHub.
data_rent_complete <- vroom("./daten/Datensatz_Berlin_Rent_new.csv")
data_social_complete <- vroom("./daten/Datensatz_Berlin_Social_new.csv", locale = locale(grouping_mark = ".", decimal_mark = ",", encoding = "UTF-8"))
# Grid is from https://gdz.bkg.bund.de/index.php/default/inspire/sonstige-inspire-themen/geographische-gitter-fur-deutschland-in-lambert-projektion-geogitter-inspire.html
inspire_grid_complete <- st_read("./daten/INSPIRE_Grid/DE_Grid_ETRS89-LAEA_1km.gpkg/geogitter/DE_Grid_ETRS89-LAEA_1km.gpkg")
# Source: https://daten.odis-berlin.de/de/dataset/bezirksgrenzen/
bezirksgrenzen_berlin <- st_read("./daten/bezirksgrenzen_berlin/bezirksgrenzen.shp") %>%
st_transform("EPSG:3035")# Filtering inspire grid to only the squares that fit over Berlin
inspire_grid_berlin <- inspire_grid_complete %>%
st_filter(bezirksgrenzen_berlin) %>%
mutate(r1_id = paste(x_sw / 1000, y_sw / 1000, sep = "_"))
# Removing original data
rm(inspire_grid_complete)
# Saving Berlin grid
st_write(inspire_grid_berlin, "./daten/inspire_grid_berlin.gpkg", append = FALSE)
# Plotting
ggplot()+
geom_sf(data = inspire_grid_berlin)+
geom_sf(data = bezirksgrenzen_berlin, fill = NA)
# Getting unique r1_ids for filtering
r1_ids <- inspire_grid_berlin %>%
pull(r1_id) %>%
unique()# Filtering data_rent
data_rent <- data_rent_complete %>%
select(r1_id, mietekalt, wohnflaeche, jahr) %>%
filter(r1_id %in% r1_ids) %>%
mutate(mietekalt_m2 = mietekalt / wohnflaeche) %>%
filter(mietekalt > 0,
wohnflaeche > 0)
# Removing original data
rm(data_rent_complete)
# Printing
head(data_rent)# Filtering data_social
# Renaming and adding variables
data_social <- data_social_complete %>%
select(r1_id, year, r1_mba_a_haushalt, r1_mps_p_obmittel, r1_mps_p_ober, r1_kkr_w_summe, r1_mbe_p_haustyp_1:r1_mbe_p_haustyp_6, r1_alq_p_quote, r1_met_p_deutschl, r1_mri_p_risiko_1:r1_mri_p_risiko_9, r1_eag_p_m60bis65:r1_eag_p_m75undgr, r1_eag_p_w60bis65:r1_eag_p_w75undgr) %>%
filter(r1_id %in% r1_ids) %>%
mutate_if(is.numeric, ~na_if(., -1)) %>%
mutate(jahr = year,
anzahl_haushalte = r1_mba_a_haushalt,
arbeitslosenquote = r1_alq_p_quote,
anteil_oberklassewagen = r1_mps_p_ober + r1_mps_p_obmittel,
kaufkraft_pro_haushalt = r1_kkr_w_summe / r1_mba_a_haushalt,
anteil_efh = r1_mbe_p_haustyp_1 + r1_mbe_p_haustyp_2,
anteil_mfh = r1_mbe_p_haustyp_3 + r1_mbe_p_haustyp_4,
anteil_wohnblock = r1_mbe_p_haustyp_5 + r1_mbe_p_haustyp_6,
anteil_auslaender = 100 - r1_met_p_deutschl,
kreditrisiko = (r1_mri_p_risiko_1 * 1 + r1_mri_p_risiko_2 * 2 + r1_mri_p_risiko_3 * 3 + r1_mri_p_risiko_4 * 4 + r1_mri_p_risiko_5 * 5 + r1_mri_p_risiko_6 * 6 + r1_mri_p_risiko_7 * 7 + r1_mri_p_risiko_8 * 8 + r1_mri_p_risiko_9 * 9) / 900,
anteil_60_plus = r1_eag_p_m60bis65 + r1_eag_p_m65bis75 + r1_eag_p_m75undgr + r1_eag_p_w60bis65 + r1_eag_p_w65bis75 + r1_eag_p_w75undgr,
.keep = "unused")
# Removing the original data
rm(data_social_complete)
# Printing
head(data_social)# Saving cleaned data
#write_csv(data_rent, "./daten/data_rent.csv")
#write_csv(data_social, "./daten/data_social.csv")