# 캐글 : https://www.kaggle.com/c/bike-sharing-demand
# 코드 참고
# https://github.com/corazzon/KaggleStruggle/blob/master/bike-sharing-demand/bike-sharing-demand-ensemble-model.ipynb

# 2011년에 세워진 자전거 스타트업.
# 2011년부터 성장을 거듭함 (count가 성장하는 추세임)

import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

# Render matplotlib figures inline in the notebook output.
%matplotlib inline

# Use the ggplot style so the grid makes value ranges easier to read.
plt.style.use('ggplot')

# Work around the minus sign rendering incorrectly in plot text.
mpl.rcParams['axes.unicode_minus'] = False

# Suppress every warning for the rest of the session (keeps cell output clean).
import warnings  
warnings.filterwarnings('ignore')

# Load the training set; parse_dates tells pandas to read the 'datetime'
# column as datetime values rather than plain strings.
train = pd.read_csv("data_bike/train.csv", parse_dates=['datetime'])
print(train.shape)
train.head(3)

train.info()

>>>
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10886 entries, 0 to 10885
Data columns (total 12 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   datetime    10886 non-null  datetime64[ns]
 1   season      10886 non-null  int64         
 2   holiday     10886 non-null  int64         
 3   workingday  10886 non-null  int64         
 4   weather     10886 non-null  int64         
 5   temp        10886 non-null  float64       
 6   atemp       10886 non-null  float64       
 7   humidity    10886 non-null  int64         
 8   windspeed   10886 non-null  float64       
 9   casual      10886 non-null  int64         
 10  registered  10886 non-null  int64         
 11  count       10886 non-null  int64         
dtypes: datetime64[ns](1), float64(3), int64(8)
memory usage: 1020.7 KB

# Load the test set the same way, parsing 'datetime' as datetime values.
test = pd.read_csv('data_bike/test.csv', parse_dates=['datetime'])

print(test.shape)
test.head(3)

1 Feature Engineering

1.0.1 datetime 데이터 정리

# Derive year, month, hour and day-of-week columns from the timestamp.
for part in ("year", "month", "hour", "dayofweek"):
    train[part] = getattr(train["datetime"].dt, part)

print(train.shape)
train.head(3)

# Add the same year, month, hour and day-of-week columns to the test set.
for part in ("year", "month", "hour", "dayofweek"):
    test[part] = getattr(test["datetime"].dt, part)

print(test.shape)
test.head(3)

# Features are either continuous or categorical.
# The columns listed here are the categorical ones; their dtype is changed
# to "category" further down.
categorical_feature_names = ["season","holiday","workingday","weather",
                             "dayofweek","month","year","hour"]

범주형 피처는 원-핫 인코딩을 해 주면 점수가 올라가는 경우도 있다.

하지만 이 데이터의 범주형 피처들은 실제로 원-핫 인코딩을 해 봐도 점수가 오르지 않았다.
그래서 여기서는 원-핫 인코딩을 하지 않고 category 타입으로만 변환한다.

# Convert every categorical column to the pandas "category" dtype,
# in both the training and the test frame.
for frame in (train, test):
    for column in categorical_feature_names:
        frame[column] = frame[column].astype("category")

train.info()

>>>
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10886 entries, 0 to 10885
Data columns (total 16 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   datetime    10886 non-null  datetime64[ns]
 1   season      10886 non-null  category      
 2   holiday     10886 non-null  category      
 3   workingday  10886 non-null  category      
 4   weather     10886 non-null  category      
 5   temp        10886 non-null  float64       
 6   atemp       10886 non-null  float64       
 7   humidity    10886 non-null  int64         
 8   windspeed   10886 non-null  float64       
 9   casual      10886 non-null  int64         
 10  registered  10886 non-null  int64         
 11  count       10886 non-null  int64         
 12  year        10886 non-null  category      
 13  month       10886 non-null  category      
 14  hour        10886 non-null  category      
 15  dayofweek   10886 non-null  category      
dtypes: category(8), datetime64[ns](1), float64(3), int64(4)
memory usage: 767.8 KB

2 Feature Selection

# Columns used as model inputs (selected for both X_train and X_test below).
feature_names = ["season", "weather", "temp", "atemp", "humidity",
                 "year", "hour", "dayofweek", "holiday", "workingday"
                ]
# Bare expression: only displays the list when it ends a notebook cell.
feature_names

>>>
['season',
 'weather',
 'temp',
 'atemp',
 'humidity',
 'year',
 'hour',
 'dayofweek',
 'holiday',
 'workingday']

# Training inputs: only the selected feature columns.
X_train = train[feature_names]

print(X_train.shape)
X_train.head()

# Test inputs: the same columns, in the same order, as the training inputs.
X_test = test[feature_names]

print(X_test.shape)
X_test.head()

# Name of the target column to predict.
label_name = "count"

# Training target, still on the original (untransformed) count scale.
y_train = train[label_name]

print(y_train.shape)
y_train.head()

>>>
(10886,)
0    16
1    40
2    32
3    13
4     1
Name: count, dtype: int64

2.0.1 평가(scoring)방식: rmsle 함수

from sklearn.metrics import make_scorer

def rmsle(predicted_values, actual_values, convertExp=True):
    """Return the Root Mean Squared Logarithmic Error (RMSLE).

    RMSLE = sqrt(mean((log(predicted + 1) - log(actual + 1)) ** 2))

    Args:
        predicted_values: Array-like of predictions.
        actual_values: Array-like of ground-truth values.
        convertExp: When True, ``np.exp`` is applied to both inputs before
            scoring, i.e. the inputs are taken to be on a plain ``log``
            scale. Values produced with ``np.log1p`` should instead be
            inverted with ``np.expm1`` by the caller and passed with
            ``convertExp=False``.

    Returns:
        The RMSLE as a scalar float.
    """
    if convertExp:
        # No trailing comma here: a stray comma would wrap the array in a
        # 1-tuple and silently give the data an extra leading dimension.
        predicted_values = np.exp(predicted_values)
        actual_values = np.exp(actual_values)

    predicted_values = np.asarray(predicted_values, dtype=float)
    actual_values = np.asarray(actual_values, dtype=float)

    # log1p(x) == log(x + 1); the +1 keeps zero values from mapping to -inf.
    log_difference = np.log1p(predicted_values) - np.log1p(actual_values)

    # Square, average, then take the root.
    return np.sqrt(np.mean(np.square(log_difference)))

2.1 선형회귀 모델

선형회귀(최소제곱법)는 가장 간단하고 오래된 회귀용 선형 알고리즘이다.
선형회귀는 예측값과 훈련 세트의 타깃 y 사이의 평균제곱오차(MSE)를 최소화하는 파라미터 w와 b를 찾는다.
조정할 하이퍼파라미터가 없는 것이 장점이지만, 그만큼 모델의 복잡도를 제어할 수 없다는 단점이 있다.
위키피디아의 선형회귀 그래프(이 글에는 포함되어 있지 않음)에서는 파란 선이 선형회귀 모델을, 빨간 점들이 훈련 데이터를 나타낸다.

2.2 RandomForestRegressor() 모델

from sklearn.ensemble import RandomForestRegressor
rfModel = RandomForestRegressor(n_estimators=100)

# The target is skewed, so the model is trained on log1p(count).
y_train_log = np.log1p(y_train)
rfModel.fit(X_train, y_train_log)

# Predictions come back on the same log1p scale as the training target.
preds = rfModel.predict(X_train)

# Undo log1p with its exact inverse, expm1. Plain exp would leave every value
# one too large and distort the score. rmsle() applies log(x + 1) itself, so
# the values are passed on the original count scale with convertExp=False.
# NOTE: this scores the model on the data it was trained on, so the value is
# optimistic and is not an estimate of the leaderboard score.
score = rmsle(np.expm1(preds), np.expm1(y_train_log), False)

print ("RMSLE Value For Random Forest: ",score)


>>>
RMSLE Value For Random Forest:  0.10756360451030088

# Predict the test set; the values are on the log1p scale the model was fit on.
predsTest = rfModel.predict(X_test)

# Compare the distribution of the training counts (left) with the predicted
# test counts (right). The predictions are mapped back to counts with expm1,
# the inverse of the log1p applied to the training target.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases;
# consider sns.histplot(..., kde=True) when upgrading.
fig, (ax1, ax2) = plt.subplots(ncols=2)
fig.set_size_inches(12, 5)
sns.distplot(y_train, ax=ax1, bins=50)
sns.distplot(np.expm1(predsTest), ax=ax2, bins=50)

3 Submit

# Start from the sample submission so the file keeps the expected layout.
submission = pd.read_csv("data_bike/sampleSubmission.csv")
submission

# The model predicts log1p(count), so convert back with expm1. Using exp here
# would submit every count one too high.
submission["count"] = np.expm1(predsTest)

print(submission.shape)
submission.head(10)

# The training-set RMSLE is embedded in the file name to tell runs apart.
submission.to_csv("data_bike/Score_{0:.5f}_submission.csv".format(score), index=False)

# Score_0.10711_submission.csv 파일 제출
225/3242  # 상위 7%에 해당하는 성적
>>> 
0.06940160394818014

4 결론

- 랜덤포레스트 회귀 사용
- 특정한 피쳐들 선택(특별한 피쳐 엔지니어링 안했음)

이렇게만 해도 상위 약 7%(225/3242 ≈ 6.9%)에 해당하는 성적을 얻을 수 있다!

저작자표시 비영리 변경금지 (새창열림)

'Machine Learning > 머신러닝 완벽가이드 for Python' 카테고리의 다른 글

ch6.2 PCA (0)	2022.10.20
ch6. 차원 축소 (0)	2022.10.20
예제 1-1. bike-sharing-demand_EDA (0)	2022.10.13
ch 5.7_로지스틱 회귀_ 5.8_회귀 트리 (실습) (0)	2022.10.13
ch.5.8 회귀 트리 (0)	2022.10.13

관리회계 & 데이터 분석 스터디

예제 1-2. bike-sharing-demand_랜덤포레스트회귀

1 Feature Engineering

1.0.1 datetime 데이터 정리

2 Feature Selection

2.0.1 평가(scoring)방식: rmsle 함수

2.1 선형회귀 모델

2.2 RandomForestRegressor() 모델

3 Submit

4 결론

'Machine Learning > 머신러닝 완벽가이드 for Python' 카테고리의 다른 글

티스토리툴바

예제 1-2. bike-sharing-demand_랜덤포레스트회귀

1 Feature Engineering

1.0.1 datetime 데이터 정리

2 Feature Selection

2.0.1 평가(scoring)방식: rmsle 함수

2.1 선형회귀 모델

2.2 RandomForestRegressor() 모델

3 Submit

4 결론

'Machine Learning > 머신러닝 완벽가이드 for Python' 카테고리의 다른 글

'Machine Learning/머신러닝 완벽가이드 for Python' Related Articles

티스토리툴바