-
Notifications
You must be signed in to change notification settings - Fork 7
/
post_analysis.py
40 lines (30 loc) · 990 Bytes
/
post_analysis.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
import pickle
import math
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle
from math import sqrt
from sklearn.ensemble import RandomForestRegressor
from sklearn import cross_validation
from sklearn.metrics import mean_squared_error
from matplotlib import style
style.use('ggplot')
# Load the post-analysis dataset. The file is read relative to the current
# working directory and must contain a 'duration' column (the target).
df = pd.read_csv('post_analysis_data.csv')

# Features are every column except 'duration'; the target is 'duration'.
# `axis` is passed by keyword because pandas 2.0 no longer accepts it
# positionally in DataFrame.drop.
X = np.array(df.drop(['duration'], axis=1))
y = np.array(df['duration'])

# Hold out 20% of the rows for evaluation.
# NOTE(review): `sklearn.cross_validation` was removed in scikit-learn 0.20;
# on current releases the file-level import must become
# `from sklearn.model_selection import train_test_split`.
X_train, X_test, y_train, y_test = cross_validation.train_test_split(
    X, y, test_size=0.2
)

# Analyse how the number of trees affects RandomForest test error.
# Each row of `a` is (n_estimators, RMSE on the held-out set).
# NOTE(review): the first row (10 trees, RMSE 247) is hard-coded rather than
# measured here; presumably it comes from an earlier run - confirm.
a = np.array([[10, 247]])
for i in range(20, 60, 10):
    clf = RandomForestRegressor(n_estimators=i)
    clf.fit(X_train, y_train)
    y_actual = y_test
    y_pred = clf.predict(X_test)
    rms = sqrt(mean_squared_error(y_actual, y_pred))
    a = np.append(a, [[i, rms]], axis=0)

# Plot RMSE (column 1) against tree count (column 0). The original sliced
# `a[: 1]`, which selects the first *row* and fails with a shape mismatch.
plt.plot(a[:, 0], a[:, 1], linewidth=2.0)
plt.show()