We read in the two CSV files for each year. The ‘properties_2016/2017.csv’ files contain the home characteristic variables. The ‘train_2016/2017.csv’ files contain the log error of the difference between the Zestimate for the house and the actual selling price. Both files contain a unique parcel ID for each house, which we use to into two dataframes, one for each year. We then row bind these two dataframes. Our final dataframe is called ‘full_union’.
properties_2016 = read.csv('properties_2016.csv')
train_2016 = read.csv('train_2016_v2.csv')
properties_2017 = read.csv('properties_2017.csv')
train_2017 = read.csv('train_2017.csv')
full_data_2016 = merge(properties_2016, train_2016, by = 'parcelid')
full_data_2017 = merge(properties_2017, train_2017, by = 'parcelid')
full_union = rbind(full_data_2016,full_data_2017)
After scanning the dataset, we saw that the dataset contained a lot of NAs, some variables contain many more than others. We decided to keep variables that had a small number of NAs or NAs that made sense to impute. For example, it would be difficult to impute region ID zipcode or finished square footage. Imputing these variables might actually mislead our models. The ‘on_hold’ list contains variables that we chose not to impute right now but could be useful to try to incorporate into a model later on. The variables that we removed were consistent with what we observed in other notebooks, as well.
sapply(full_union, function(x) sum(is.na(x)))
## parcelid airconditioningtypeid
## 0 114100
## architecturalstyletypeid basementsqft
## 167420 167795
## bathroomcnt bedroomcnt
## 34 34
## buildingclasstypeid buildingqualitytypeid
## 167857 60715
## calculatedbathnbr decktypeid
## 1832 166616
## finishedfloor1squarefeet calculatedfinishedsquarefeet
## 154995 896
## finishedsquarefeet12 finishedsquarefeet13
## 8369 167813
## finishedsquarefeet15 finishedsquarefeet50
## 161297 154995
## finishedsquarefeet6 fips
## 167081 34
## fireplacecnt fullbathcnt
## 149992 1832
## garagecarcnt garagetotalsqft
## 112431 112431
## hashottuborspa heatingorsystemtypeid
## 0 62237
## latitude longitude
## 34 34
## lotsizesquarefeet poolcnt
## 18442 133813
## poolsizesum pooltypeid10
## 166050 166262
## pooltypeid2 pooltypeid7
## 165610 136112
## propertycountylandusecode propertylandusetypeid
## 0 34
## propertyzoningdesc rawcensustractandblock
## 0 34
## regionidcity regionidcounty
## 3309 34
## regionidneighborhood regionidzip
## 100902 119
## roomcnt storytypeid
## 34 167795
## threequarterbathnbr typeconstructiontypeid
## 145773 167366
## unitcnt yardbuildingsqft17
## 58832 162849
## yardbuildingsqft26 yearbuilt
## 167723 1060
## numberofstories fireplaceflag
## 129719 0
## structuretaxvaluedollarcnt taxvaluedollarcnt
## 529 36
## assessmentyear landtaxvaluedollarcnt
## 34 37
## taxamount taxdelinquencyflag
## 45 0
## taxdelinquencyyear censustractandblock
## 163205 886
## logerror transactiondate
## 0 0
names_usable = c('parcelid', 'bathroomcnt','bedroomcnt', 'buildingqualitytypeid','calculatedfinishedsquarefeet','fullbathcnt', 'hashottuborspa','heatingorsystemtypeid', 'latitude', 'longitude', 'propertylandusetypeid', 'regionidcity', 'regionidcounty', 'yearbuilt', 'structuretaxvaluedollarcnt', 'taxvaluedollarcnt','landtaxvaluedollarcnt','taxamount', 'taxdelinquencyflag', 'decktypeid', 'airconditioningtypeid', 'finishedsquarefeet12', 'fireplacecnt', 'poolcnt', 'pooltypeid2', 'yardbuildingsqft17', 'yardbuildingsqft26', 'fireplaceflag','logerror')
remove_list = c('calculatedbathnbr', 'finishedsquarefeet12','fips', 'propertycountylandusecode','propertyzoningdesc', 'rawcensustractandblock', 'regionidzip', 'assessmentyear', 'censustractandblock','architecturalstyletypeid','basementsqft', 'buildingclasstypeid', 'finishedfloor1squarefeet', 'finishedsquarefeet13', 'finishedsquarefeet15', 'finishedsquarefeet50','finishedsquarefeet6', 'poolsizesum', 'pooltypeid10', 'pooltypeid7', 'regionidzip', 'storytypeid', 'threequarterbathnbr', 'typeconstructiontypeid', 'taxdelinquencyyear' )
on_hold = c('lotsizesquarefeet', 'garagecnt', 'garagetotalsqft','regionidneighborhood', 'roomcnt', 'unitcnt', 'numberofstories', 'transactiondate')
For simplicity sake, we decided to impute NAs in any variable with the mean value of all values in that variable. We do this for bathroom count, bedroom count, building quality, total living area and number of full baths. Additionally, we turn the hot tub variable into a factor, impute NAs in heating system with a 14 (which indicates ‘Other’) and convert latitude and longitude values to real lat/long values by dividing by 10000000.
mean_impute = function(data){
var_mean = floor(mean(data[!is.na(data)])) #find mean
data[is.na(data)] = var_mean #set NA values to mean
}
#Bathroom Counts
full_union$bathroomcnt[is.na(full_union$bathroomcnt)] = mean_impute(data = full_union$bathroomcnt)
#Bedroom Counts
full_union$bedroomcnt[is.na(full_union$bedroomcnt)] = mean_impute(data = full_union$bedroomcnt)
#Buidling Quality
full_union$buildingqualitytypeid[is.na(full_union$buildingqualitytypeid)] = mean_impute(data = full_union$buildingqualitytypeid)
#total living area - calculatedfinishedsquarefeet
full_union$calculatedfinishedsquarefeet[is.na(full_union$calculatedfinishedsquarefeet)] = mean_impute(data = full_union$calculatedfinishedsquarefeet)
#number of full baths (shower,bathtub,sink,toilet)
full_union$fullbathcnt[is.na(full_union$fullbathcnt)] = mean_impute(data = full_union$fullbathcnt)
#convert to factor with a 1 if it does have a hottub or spa and 0 otherwise
levels(full_union$hashottuborspa)[levels(full_union$hashottuborspa) == c('','true')] = c(0,1)
#Heating or system Id change NA to 'Other' ID
full_union$heatingorsystemtypeid[is.na(full_union$heatingorsystemtypeid)] = 14
#Convert Lat and Lon to real lat lon values
full_union$latitude = full_union$latitude/10000000
full_union$longitude = full_union$longitude/10000000
We make additional data imputations here. Each variable had to be assessed on it’s own. It would have been easy to drop any column with a lot of NAs but many variables allowed for assumptions to be made, for example a fireplace with a NA could be assumed to have no fireplace so we changed the value to 0. We engineered new features, such as yard building square footage, which measured patio size. We assumed NAs meant that there was no patio so we dropped the original variable and made a categorical variable that indicated whether there was a patio or not. We also perform categorical encoding on categorical data variables.
#change levels of taxdelinquencyflag to be 0 if no flag, 1 if flag
levels(full_union$taxdelinquencyflag)[levels(full_union$taxdelinquencyflag) == c('','Y')] = c(0,1)
# Turn decktype id into either a yes or a no encoded categorical
full_union$decktypeid[!is.na(full_union$decktypeid)] = 1
full_union$decktypeid[is.na(full_union$decktypeid)] = 0
full_union$decktypeid = factor(full_union$decktypeid)
#change NAs to other and 'Yes' type to other, then make categorical encoding
full_union$airconditioningtypeid[is.na(full_union$airconditioningtypeid)] = 6
full_union$airconditioningtypeid[full_union$airconditioningtypeid == 13] = 6
full_union$airconditioningtypeid = factor(full_union$airconditioningtypeid)
#Make NAs in fireplace count 0
full_union$fireplacecnt[is.na(full_union$fireplacecnt)] = 0
#Make NAs in poolcount 0
full_union$poolcnt[is.na(full_union$poolcnt)] = 0
#convert pooltypeid2 into categorical yes or no
full_union$pooltypeid2[is.na(full_union$pooltypeid2)] = 0
full_union$pooltypeid2 = factor(full_union$pooltypeid2)
#change the patio squarefeet in a yard to whether a yard has a patio or not - categorical encode
full_union$yardbuildingsqft17[!is.na(full_union$yardbuildingsqft17)] = 1
full_union$yardbuildingsqft17[is.na(full_union$yardbuildingsqft17)] = 0
full_union$yardbuildingsqft17 = factor(full_union$yardbuildingsqft17)
#same as patio, but for storage shed
full_union$yardbuildingsqft26[!is.na(full_union$yardbuildingsqft26)] = 1
full_union$yardbuildingsqft26[is.na(full_union$yardbuildingsqft26)] = 0
full_union$yardbuildingsqft26 = factor(full_union$yardbuildingsqft26)
#additional data on whether or not a house has a fireplace - categorical encode
levels(full_union$fireplaceflag)[levels(full_union$fireplaceflag) == c('','true')] = c(0,1)
#convert year built to age
full_union$yearbuilt = 2020 - full_union$yearbuilt
#convert heatingsystem id to categorical - change 'yes', NAs to other
full_union$heatingorsystemtypeid[full_union$heatingorsystemtypeid == 24] = 14
full_union$heatingorsystemtypeid[is.na(full_union$heatingorsystemtypeid)] = 14
full_union$heatingorsystemtypeid = factor(full_union$heatingorsystemtypeid)
We filter out our dataframe for only the variables we specified earlier as those that are usable.
full_union = full_union[,colnames(full_union) %in% names_usable]
full_union = na.omit(full_union)
We rename some variables to mirror the changes we made in imputation and to be more descriptive of their true nature.
full_union = full_union %>% rename(
property_age = yearbuilt,
patio = yardbuildingsqft17,
shed = yardbuildingsqft26,
area_total_calc = calculatedfinishedsquarefeet,
living_area_sqft = finishedsquarefeet12,
num_bathroom = bathroomcnt,
num_bedroom = bedroomcnt,
num_full_bath = fullbathcnt,
num_fireplace = fireplacecnt,
num_pool = poolcnt,
pool_hot_or_spa = pooltypeid2,
region_county = regionidcounty,
region_city = regionidcity,
tax_total = taxvaluedollarcnt,
tax_building = structuretaxvaluedollarcnt,
tax_land = landtaxvaluedollarcnt,
tax_property = taxamount,
tax_delinquency = taxdelinquencyflag,
zoning_landuse = propertylandusetypeid,
flag_fireplace = fireplaceflag,
tub_or_spa = hashottuborspa,
building_quality = buildingqualitytypeid,
deck = decktypeid,
heating_system = heatingorsystemtypeid,
ac_type = airconditioningtypeid,
)
write.csv(full_union, 'Zillow_Clean_v4.csv', row.names = F )