Building a weighted KNN model to predict house prices using Python.
GitHub link for the project and the data: https://github.com/MNoorFawi/weighted-knn-in-python
Import the required libraries.
import pandas as pd
import numpy as np
import math  # used by the gaussian weight function defined below
from sklearn.neighbors import KDTree
from sklearn.preprocessing import StandardScaler
import seaborn as sns
import random
Import the data, remove the unnecessary “id” and “date” columns, and split the price into a separate variable.
data = pd.read_csv("home_data-train.txt", sep = ",", header = None)
del data[0]  # id column
del data[1]  # date column
test = pd.read_excel("HomePrices-Test.xlsx", header = 0)
del test["id"]
del test["date"]
data.columns = test.columns
train_price = data.price
del data["price"]
test_price = test.price
del test["price"]
Examine the correlation between each variable and price to identify the important variables.
sns.set(rc = {'figure.figsize' : (11.7, 8.27)})
corr = pd.concat([train_price, data], axis = 1).corr()
corr_map = sns.heatmap(corr, annot = True, fmt = ".1g", cmap = "coolwarm")
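As an optional check alongside the heatmap, the raw correlation of each feature with price can also be printed directly; a minimal sketch using the corr matrix built above:

# correlation of each feature with price, strongest first
print(corr.iloc[1:, 0].sort_values(ascending = False))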
Extract important variables and standardize train and test data.
correlated = data.columns[corr.iloc[1:, 0] >= 0.3]
scaled = StandardScaler().fit(data[correlated])
train_scaled = scaled.transform(data[correlated])
test_scaled = scaled.transform(test[correlated])
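To confirm which features made the cut (the exact set depends on the 0.3 threshold and the data, and should match the columns shown in the query output below), the selected columns can be listed:

# features whose correlation with price is at least 0.3
print(list(correlated))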
Construct the KDTree and test it.
tree = KDTree(train_scaled)
nearest_dist, nearest_ind = tree.query(test_scaled[13].reshape(1, -1), k = 3)
print(test.loc[13, correlated], "\n")
print(data.loc[nearest_ind[0], correlated], "\n")
print("test price: ", test_price[13], "\n")
print("train price: \n", list(train_price[nearest_ind[0]]))

# bedrooms            2.0000
# bathrooms           2.5000
# sqft_living      1278.0000
# view                0.0000
# grade               7.0000
# sqft_above       1002.0000
# sqft_basement     276.0000
# lat                47.5532
# sqft_living15    1220.0000
# Name: 13, dtype: float64
#
#        bedrooms  bathrooms  sqft_living  view  grade  sqft_above  \
# 19933         2        2.5         1233     0      7         963
# 9192          2        2.5         1250     0      7        1030
# 18439         2        2.5         1230     0      7        1060
#
#        sqft_basement      lat  sqft_living15
# 19933            270  47.5533           1230
# 9192             220  47.5243           1250
# 18439            170  47.6007           1290
#
# test price:  358000
#
# train price:
#  [360000, 267100, 380000]
Define the weight functions (Gaussian, subtract weight, and inverse weight, which is the one we will use), the weighted KNN algorithm, and the test (RMSE) function.
def inverseweight(dist, num = 1.0, const = 0.1):
    return num / (dist + const)

def gaussian(dist, sigma = 10.0):
    return math.e ** (- dist ** 2 / (2 * sigma ** 2))

def subtractweight(dist, const = 2.0):
    if dist > const:
        return 0.001
    else:
        return const - dist

def weighted_knn(kdtree, test_point, target, k = 25, weight_fun = inverseweight):
    # prediction is the weighted average of the k nearest targets:
    # sum(w_i * y_i) / sum(w_i), so closer neighbors contribute more
    nearest_dist, nearest_ind = kdtree.query(test_point, k = k)
    avg = 0.0
    totalweight = 0.0
    for i in range(k):
        dist = nearest_dist[0][i]
        idx = nearest_ind[0][i]
        weight = weight_fun(dist)
        avg += weight * target[idx]
        totalweight += weight
    avg = round(avg / totalweight)
    return avg

def testalgorithm(algo, kdtree, testset, target, test_target):
    # root mean squared error over the whole test set
    error = 0.0
    for row in range(len(testset)):
        guess = algo(kdtree, testset[row].reshape(1, -1), target)
        error += (test_target[row] - guess) ** 2
    return round(np.sqrt(error / len(testset)))
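To get a feel for how the weighting behaves, the three functions can be evaluated at a few sample distances; a quick illustrative check (the distances here are arbitrary, not taken from the data):

# closer neighbors get larger weights; inverseweight decays fastest near zero
for d in [0.1, 0.5, 1.0, 2.0, 5.0]:
    print(d, round(inverseweight(d), 3), round(gaussian(d), 3), round(subtractweight(d), 3))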
Test the algorithm.
random.seed(1191)
ex = random.sample(range(len(test)), 5)
print("predicted", ";", "actual", " ;", "error")
for i in ex:
    res = weighted_knn(tree, test_scaled[i].reshape(1, -1), train_price)
    print(res, " ;", test_price[i], " ;", abs(test_price[i] - res))

# predicted ; actual  ; error
# 446422.0  ; 399995  ; 46427.0
# 542199.0  ; 653500  ; 111301.0
# 331369.0  ; 360000  ; 28631.0
# 375849.0  ; 255000  ; 120849.0
# 633987.0  ; 687015  ; 53028.0
Algorithm’s RMSE.
print(testalgorithm(weighted_knn, tree, test_scaled, train_price, test_price))
# 192420.0
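For comparison, a similar distance-weighted prediction can be obtained with scikit-learn's KNeighborsRegressor (weights = "distance" applies inverse-distance weighting, which is close to but not identical to the inverseweight function above). A minimal sketch reusing the scaled arrays, as a sanity check rather than part of the model above:

from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error

# k = 25 and inverse-distance weighting, mirroring weighted_knn's defaults
knn = KNeighborsRegressor(n_neighbors = 25, weights = "distance")
knn.fit(train_scaled, train_price)
pred = knn.predict(test_scaled)
print(round(np.sqrt(mean_squared_error(test_price, pred))))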