0. Import des données

library(readr)
library(dplyr)
library(lubridate)
library(ggplot2)
# Read the home sales CSV into a tibble; show_col_types = FALSE silences
# readr's column-specification message.
house_sales <- readr::read_csv(
  "../static/home_data.csv",
  show_col_types = FALSE
)
# Preview the first six rows.
head(house_sales, n = 6L)
## # A tibble: 6 × 21
##   id         date                  price bedrooms bathrooms sqft_living sqft_lot
##   <chr>      <dttm>                <dbl>    <dbl>     <dbl>       <dbl>    <dbl>
## 1 7129300520 2014-10-13 00:00:00  221900        3      1           1180     5650
## 2 6414100192 2014-12-09 00:00:00  538000        3      2.25        2570     7242
## 3 5631500400 2015-02-25 00:00:00  180000        2      1            770    10000
## 4 2487200875 2014-12-09 00:00:00  604000        4      3           1960     5000
## 5 1954400510 2015-02-18 00:00:00  510000        3      2           1680     8080
## 6 7237550310 2014-05-12 00:00:00 1225000        4      4.5         5420   101930
## # ℹ 14 more variables: floors <dbl>, waterfront <dbl>, view <dbl>,
## #   condition <dbl>, grade <dbl>, sqft_above <dbl>, sqft_basement <dbl>,
## #   yr_built <dbl>, yr_renovated <dbl>, zipcode <dbl>, lat <dbl>, long <dbl>,
## #   sqft_living15 <dbl>, sqft_lot15 <dbl>
# Report the number of rows and columns of the imported table.
dim(house_sales)
## [1] 21613    21
# Print a compact per-column overview (type and first values) of the table.
str(house_sales)
## spc_tbl_ [21,613 × 21] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ id           : chr [1:21613] "7129300520" "6414100192" "5631500400" "2487200875" ...
##  $ date         : POSIXct[1:21613], format: "2014-10-13" "2014-12-09" ...
##  $ price        : num [1:21613] 221900 538000 180000 604000 510000 ...
##  $ bedrooms     : num [1:21613] 3 3 2 4 3 4 3 3 3 3 ...
##  $ bathrooms    : num [1:21613] 1 2.25 1 3 2 4.5 2.25 1.5 1 2.5 ...
##  $ sqft_living  : num [1:21613] 1180 2570 770 1960 1680 ...
##  $ sqft_lot     : num [1:21613] 5650 7242 10000 5000 8080 ...
##  $ floors       : num [1:21613] 1 2 1 1 1 1 2 1 1 2 ...
##  $ waterfront   : num [1:21613] 0 0 0 0 0 0 0 0 0 0 ...
##  $ view         : num [1:21613] 0 0 0 0 0 0 0 0 0 0 ...
##  $ condition    : num [1:21613] 3 3 3 5 3 3 3 3 3 3 ...
##  $ grade        : num [1:21613] 7 7 6 7 8 11 7 7 7 7 ...
##  $ sqft_above   : num [1:21613] 1180 2170 770 1050 1680 ...
##  $ sqft_basement: num [1:21613] 0 400 0 910 0 1530 0 0 730 0 ...
##  $ yr_built     : num [1:21613] 1955 1951 1933 1965 1987 ...
##  $ yr_renovated : num [1:21613] 0 1991 0 0 0 ...
##  $ zipcode      : num [1:21613] 98178 98125 98028 98136 98074 ...
##  $ lat          : num [1:21613] 47.5 47.7 47.7 47.5 47.6 ...
##  $ long         : num [1:21613] -122 -122 -122 -122 -122 ...
##  $ sqft_living15: num [1:21613] 1340 1690 2720 1360 1800 ...
##  $ sqft_lot15   : num [1:21613] 5650 7639 8062 5000 7503 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   id = col_character(),
##   ..   date = col_datetime(format = ""),
##   ..   price = col_double(),
##   ..   bedrooms = col_double(),
##   ..   bathrooms = col_double(),
##   ..   sqft_living = col_double(),
##   ..   sqft_lot = col_double(),
##   ..   floors = col_double(),
##   ..   waterfront = col_double(),
##   ..   view = col_double(),
##   ..   condition = col_double(),
##   ..   grade = col_double(),
##   ..   sqft_above = col_double(),
##   ..   sqft_basement = col_double(),
##   ..   yr_built = col_double(),
##   ..   yr_renovated = col_double(),
##   ..   zipcode = col_double(),
##   ..   lat = col_double(),
##   ..   long = col_double(),
##   ..   sqft_living15 = col_double(),
##   ..   sqft_lot15 = col_double()
##   .. )
##  - attr(*, "problems")=<externalptr>
# Fix column types in a single pass:
# - date: drop the time-of-day component (date-time -> Date)
# - waterfront, grade: treat as categorical rather than numeric
house_sales <- house_sales %>%
  mutate(
    date = as_date(date),
    waterfront = as.factor(waterfront),
    grade = as.factor(grade)
  )
# Convert every area column from square feet to square metres.
# The divisor 10.764 is the (rounded) number of square feet in one square
# metre. `.keep = "unused"` drops the sqft_* inputs once they have been
# consumed, so only the sqm_* versions remain alongside the other columns.
house_sales <- mutate(
  house_sales,
  sqm_above    = sqft_above    / 10.764,
  sqm_living   = sqft_living   / 10.764,
  sqm_lot      = sqft_lot      / 10.764,
  sqm_basement = sqft_basement / 10.764,
  sqm_living15 = sqft_living15 / 10.764,
  sqm_lot15    = sqft_lot15    / 10.764,
  .keep = "unused"
)

1. Explorer les données

# Number of houses for each distinct value of `floors`.
count(house_sales, floors)
## # A tibble: 6 × 2
##   floors     n
##    <dbl> <int>
## 1    1   10680
## 2    1.5  1910
## 3    2    8241
## 4    2.5   161
## 5    3     613
## 6    3.5     8
# Bar chart: how many houses have each number of floors.
ggplot(data = house_sales, mapping = aes(x = floors)) +
  geom_bar() +
  theme_light() +
  labs(title = "House count by number of floors")

# Box plot of sale price for each level of the `waterfront` factor.
# A title is set so this figure is labelled like the other plots in the file.
ggplot(house_sales, aes(x = waterfront, y = price)) +
  geom_boxplot(width = 1) +
  labs(title = "Price distribution by waterfront status") +
  theme_light()

# Box plot of sale price for each level of the `grade` factor.
ggplot(data = house_sales, mapping = aes(x = grade, y = price)) +
  geom_boxplot(width = 1) +
  theme_light() +
  labs(title = "Price distribution across grade levels")

library(corrr)

# Pairwise correlation table; corrr drops the non-numeric columns itself
# and reports which ones were removed.
house_sales %>% correlate()
## Non-numeric variables removed from input: `id`, `date`, `waterfront`, and `grade`
## Correlation computed with
## • Method: 'pearson'
## • Missing treated using: 'pairwise.complete.obs'
## # A tibble: 17 × 18
##    term           price bedrooms bathrooms   floors     view condition yr_built
##    <chr>          <dbl>    <dbl>     <dbl>    <dbl>    <dbl>     <dbl>    <dbl>
##  1 price        NA       0.308      0.525   0.257    0.397     0.0364    0.0540
##  2 bedrooms      0.308  NA          0.516   0.175    0.0795    0.0285    0.154 
##  3 bathrooms     0.525   0.516     NA       0.501    0.188    -0.125     0.506 
##  4 floors        0.257   0.175      0.501  NA        0.0294   -0.264     0.489 
##  5 view          0.397   0.0795     0.188   0.0294  NA         0.0460   -0.0534
##  6 condition     0.0364  0.0285    -0.125  -0.264    0.0460   NA        -0.361 
##  7 yr_built      0.0540  0.154      0.506   0.489   -0.0534   -0.361    NA     
##  8 yr_renovated  0.126   0.0188     0.0507  0.00634  0.104    -0.0606   -0.225 
##  9 zipcode      -0.0532 -0.153     -0.204  -0.0591   0.0848    0.00303  -0.347 
## 10 lat           0.307  -0.00893    0.0246  0.0496   0.00616  -0.0149   -0.148 
## 11 long          0.0216  0.129      0.223   0.125   -0.0784   -0.107     0.409 
## 12 sqm_above     0.606   0.478      0.685   0.524    0.168    -0.158     0.424 
## 13 sqm_living    0.702   0.577      0.755   0.354    0.285    -0.0588    0.318 
## 14 sqm_lot       0.0897  0.0317     0.0877 -0.00520  0.0747   -0.00896   0.0531
## 15 sqm_basement  0.324   0.303      0.284  -0.246    0.277     0.174    -0.133 
## 16 sqm_living15  0.585   0.392      0.569   0.280    0.280    -0.0928    0.326 
## 17 sqm_lot15     0.0824  0.0292     0.0872 -0.0113   0.0726   -0.00341   0.0710
## # ℹ 10 more variables: yr_renovated <dbl>, zipcode <dbl>, lat <dbl>,
## #   long <dbl>, sqm_above <dbl>, sqm_living <dbl>, sqm_lot <dbl>,
## #   sqm_basement <dbl>, sqm_living15 <dbl>, sqm_lot15 <dbl>

2. Régression linéaire

# Simple linear regression: sale price explained by living area (sqm_living).
model <- lm(formula = "price ~ sqm_living", data = house_sales)
# Coefficients, residual summary and goodness-of-fit statistics.
summary(model)
## 
## Call:
## lm(formula = "price ~ sqm_living", data = house_sales)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1476062  -147486   -24043   106182  4362067 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -43580.74    4402.69  -9.899   <2e-16 ***
## sqm_living    3020.63      20.84 144.920   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 261500 on 21611 degrees of freedom
## Multiple R-squared:  0.4929, Adjusted R-squared:  0.4928 
## F-statistic: 2.1e+04 on 1 and 21611 DF,  p-value: < 2.2e-16
# Scatter plot of price against living area with the fitted linear trend.
# `formula` is given as a formula object (y ~ x), which is what geom_smooth()
# documents, rather than a character string that only works because the
# underlying fitting function happens to coerce it.
ggplot(house_sales, aes(x = sqm_living, y = price)) +
  geom_point() +
  geom_smooth(method = "lm", formula = y ~ x) +
  labs(title = "House price vs. sqm living") +
  theme_light()