0. Import des données
- Les quelques bibliothèques qu’on va utiliser :
library(readr)
library(dplyr)
library(lubridate)
library(ggplot2)
library(corrr)
- Lecture du jeu de données :
# Load the raw sales table. No column types are supplied, so readr guesses
# them; the column-type message is silenced.
house_sales <- readr::read_csv(
  file = "../static/home_data.csv",
  show_col_types = FALSE
)
- À quoi ressemblent les données ?
# Preview the first six rows of the table.
head(house_sales, n = 6L)
## # A tibble: 6 × 21
## id date price bedrooms bathrooms sqft_living sqft_lot
## <chr> <dttm> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 7129300520 2014-10-13 00:00:00 221900 3 1 1180 5650
## 2 6414100192 2014-12-09 00:00:00 538000 3 2.25 2570 7242
## 3 5631500400 2015-02-25 00:00:00 180000 2 1 770 10000
## 4 2487200875 2014-12-09 00:00:00 604000 4 3 1960 5000
## 5 1954400510 2015-02-18 00:00:00 510000 3 2 1680 8080
## 6 7237550310 2014-05-12 00:00:00 1225000 4 4.5 5420 101930
## # ℹ 14 more variables: floors <dbl>, waterfront <dbl>, view <dbl>,
## # condition <dbl>, grade <dbl>, sqft_above <dbl>, sqft_basement <dbl>,
## # yr_built <dbl>, yr_renovated <dbl>, zipcode <dbl>, lat <dbl>, long <dbl>,
## # sqft_living15 <dbl>, sqft_lot15 <dbl>
# Number of rows followed by number of columns.
c(nrow(house_sales), ncol(house_sales))
## [1] 21613 21
- Quels sont les types qui ont été détectés automatiquement ?
# Compact structure dump: one line per column with its detected type.
utils::str(house_sales)
## spc_tbl_ [21,613 × 21] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ id : chr [1:21613] "7129300520" "6414100192" "5631500400" "2487200875" ...
## $ date : POSIXct[1:21613], format: "2014-10-13" "2014-12-09" ...
## $ price : num [1:21613] 221900 538000 180000 604000 510000 ...
## $ bedrooms : num [1:21613] 3 3 2 4 3 4 3 3 3 3 ...
## $ bathrooms : num [1:21613] 1 2.25 1 3 2 4.5 2.25 1.5 1 2.5 ...
## $ sqft_living : num [1:21613] 1180 2570 770 1960 1680 ...
## $ sqft_lot : num [1:21613] 5650 7242 10000 5000 8080 ...
## $ floors : num [1:21613] 1 2 1 1 1 1 2 1 1 2 ...
## $ waterfront : num [1:21613] 0 0 0 0 0 0 0 0 0 0 ...
## $ view : num [1:21613] 0 0 0 0 0 0 0 0 0 0 ...
## $ condition : num [1:21613] 3 3 3 5 3 3 3 3 3 3 ...
## $ grade : num [1:21613] 7 7 6 7 8 11 7 7 7 7 ...
## $ sqft_above : num [1:21613] 1180 2170 770 1050 1680 ...
## $ sqft_basement: num [1:21613] 0 400 0 910 0 1530 0 0 730 0 ...
## $ yr_built : num [1:21613] 1955 1951 1933 1965 1987 ...
## $ yr_renovated : num [1:21613] 0 1991 0 0 0 ...
## $ zipcode : num [1:21613] 98178 98125 98028 98136 98074 ...
## $ lat : num [1:21613] 47.5 47.7 47.7 47.5 47.6 ...
## $ long : num [1:21613] -122 -122 -122 -122 -122 ...
## $ sqft_living15: num [1:21613] 1340 1690 2720 1360 1800 ...
## $ sqft_lot15 : num [1:21613] 5650 7639 8062 5000 7503 ...
## - attr(*, "spec")=
## .. cols(
## .. id = col_character(),
## .. date = col_datetime(format = ""),
## .. price = col_double(),
## .. bedrooms = col_double(),
## .. bathrooms = col_double(),
## .. sqft_living = col_double(),
## .. sqft_lot = col_double(),
## .. floors = col_double(),
## .. waterfront = col_double(),
## .. view = col_double(),
## .. condition = col_double(),
## .. grade = col_double(),
## .. sqft_above = col_double(),
## .. sqft_basement = col_double(),
## .. yr_built = col_double(),
## .. yr_renovated = col_double(),
## .. zipcode = col_double(),
## .. lat = col_double(),
## .. long = col_double(),
## .. sqft_living15 = col_double(),
## .. sqft_lot15 = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
- On va convertir le champ `date` en un objet `Date`, qui permettra les
comparaisons, et les champs `waterfront` et `grade` en champs de type
`factor` :
# Recast three columns in a single mutate():
#   - date:       date-time -> Date
#   - waterfront: numeric   -> factor
#   - grade:      numeric   -> factor
house_sales <- house_sales %>%
  mutate(
    date = as_date(date),
    waterfront = as.factor(waterfront),
    grade = as.factor(grade)
  )
- On va convertir les champs exprimés en pieds carrés (sq ft) en mètres
carrés et supprimer les champs d’origine (avec `.keep = "unused"`) :
# Derive square-metre columns from the square-foot ones by dividing by
# 10.764. `.keep = "unused"` drops every column consumed by these
# expressions, i.e. the original sqft_* columns.
house_sales <- mutate(
  house_sales,
  sqm_above = sqft_above / 10.764,
  sqm_living = sqft_living / 10.764,
  sqm_lot = sqft_lot / 10.764,
  sqm_basement = sqft_basement / 10.764,
  sqm_living15 = sqft_living15 / 10.764,
  sqm_lot15 = sqft_lot15 / 10.764,
  .keep = "unused"
)
1. Explorer les données
- Quelle distribution des prix ?

- Combien de maisons par nombre d’étages ?
# Tally the rows for each distinct value of `floors`.
count(house_sales, floors)
## # A tibble: 6 × 2
## floors n
## <dbl> <int>
## 1 1 10680
## 2 1.5 1910
## 3 2 8241
## 4 2.5 161
## 5 3 613
## 6 3.5 8
# Bar chart of the number of rows for each `floors` value.
house_sales %>%
  ggplot(mapping = aes(x = floors)) +
  geom_bar() +
  labs(title = "House count by number of floors") +
  theme_light()

- Comparer les prix des maisons avec vue sur le front de mer ou sans
vue sur le front de mer :
# Box plots of price, one box per level of the `waterfront` factor.
# A title is set so this figure is labelled like the other plots in the
# script.
ggplot(data = house_sales, mapping = aes(x = waterfront, y = price)) +
  geom_boxplot(width = 1) +
  labs(title = "Price distribution with and without waterfront view") +
  theme_light()

- Quelle distribution des prix au regard des notes données à chaque
maison ?
# Box plots of price, one box per level of the `grade` factor.
house_sales %>%
  ggplot(mapping = aes(x = grade, y = price)) +
  geom_boxplot(width = 1) +
  labs(title = "Price distribution across grade levels") +
  theme_light()

- Corrélation par paires :
# Pairwise correlation table. corrr drops the non-numeric columns itself
# (see the message printed below). corrr is attached together with the
# other libraries at the top of the script.
correlate(house_sales)
## Non-numeric variables removed from input: `id`, `date`, `waterfront`, and `grade`
## Correlation computed with
## • Method: 'pearson'
## • Missing treated using: 'pairwise.complete.obs'
## # A tibble: 17 × 18
## term price bedrooms bathrooms floors view condition yr_built
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 price NA 0.308 0.525 0.257 0.397 0.0364 0.0540
## 2 bedrooms 0.308 NA 0.516 0.175 0.0795 0.0285 0.154
## 3 bathrooms 0.525 0.516 NA 0.501 0.188 -0.125 0.506
## 4 floors 0.257 0.175 0.501 NA 0.0294 -0.264 0.489
## 5 view 0.397 0.0795 0.188 0.0294 NA 0.0460 -0.0534
## 6 condition 0.0364 0.0285 -0.125 -0.264 0.0460 NA -0.361
## 7 yr_built 0.0540 0.154 0.506 0.489 -0.0534 -0.361 NA
## 8 yr_renovated 0.126 0.0188 0.0507 0.00634 0.104 -0.0606 -0.225
## 9 zipcode -0.0532 -0.153 -0.204 -0.0591 0.0848 0.00303 -0.347
## 10 lat 0.307 -0.00893 0.0246 0.0496 0.00616 -0.0149 -0.148
## 11 long 0.0216 0.129 0.223 0.125 -0.0784 -0.107 0.409
## 12 sqm_above 0.606 0.478 0.685 0.524 0.168 -0.158 0.424
## 13 sqm_living 0.702 0.577 0.755 0.354 0.285 -0.0588 0.318
## 14 sqm_lot 0.0897 0.0317 0.0877 -0.00520 0.0747 -0.00896 0.0531
## 15 sqm_basement 0.324 0.303 0.284 -0.246 0.277 0.174 -0.133
## 16 sqm_living15 0.585 0.392 0.569 0.280 0.280 -0.0928 0.326
## 17 sqm_lot15 0.0824 0.0292 0.0872 -0.0113 0.0726 -0.00341 0.0710
## # ℹ 10 more variables: yr_renovated <dbl>, zipcode <dbl>, lat <dbl>,
## # long <dbl>, sqm_above <dbl>, sqm_living <dbl>, sqm_lot <dbl>,
## # sqm_basement <dbl>, sqm_living15 <dbl>, sqm_lot15 <dbl>
2. Régression linéaire
# Simple linear regression of sale price on living area (square metres).
# The formula is given as a formula object rather than a character string,
# so the stored call holds a real formula instead of text that lm() has to
# coerce.
model <- lm(price ~ sqm_living, data = house_sales)
summary(model)
##
## Call:
## lm(formula = price ~ sqm_living, data = house_sales)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1476062 -147486 -24043 106182 4362067
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -43580.74 4402.69 -9.899 <2e-16 ***
## sqm_living 3020.63 20.84 144.920 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 261500 on 21611 degrees of freedom
## Multiple R-squared: 0.4929, Adjusted R-squared: 0.4928
## F-statistic: 2.1e+04 on 1 and 21611 DF, p-value: < 2.2e-16
# Scatter plot of price against living area with a fitted linear trend.
# `formula` is passed as a formula object (y ~ x), not as a string.
ggplot(data = house_sales, mapping = aes(x = sqm_living, y = price)) +
  geom_point() +
  geom_smooth(method = "lm", formula = y ~ x) +
  labs(title = "House price vs. sqm living") +
  theme_light()
