-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathFeature_Generation.py
More file actions
90 lines (67 loc) · 2.75 KB
/
Feature_Generation.py
File metadata and controls
90 lines (67 loc) · 2.75 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
import pandas as pd
import random
from math import sin, cos, sqrt, atan2, radians
import sys
def calculate_distance_to_next_stop(train_data,bus_stop_data):
    """Return the haversine distance (km) from a stop to its nearest other stop.

    Args:
        train_data: DataFrame of candidate stops with 'Stop Latitude' and
            'Stop Longitude' columns, in degrees.
        bus_stop_data: Row-like object (anything indexable by column name)
            holding the reference stop's 'Stop Latitude' and
            'Stop Longitude', in degrees.

    Returns:
        The smallest non-zero distance in kilometres between the reference
        stop and any candidate. Candidates at distance exactly 0 (the stop
        itself, or an exact duplicate) are ignored. If no candidate
        qualifies, ``sys.maxsize`` is returned as a "not found" sentinel.
    """
    # approximate radius of earth in km
    R = 6373.0
    min_distance = sys.maxsize

    # The reference stop is identical for every candidate: convert it once.
    lat2 = radians(bus_stop_data['Stop Latitude'])
    lon2 = radians(bus_stop_data['Stop Longitude'])
    cos_lat2 = cos(lat2)

    # Only the two coordinate columns are needed, so iterate them directly
    # instead of materialising a full row object per candidate.
    for lat_deg, lon_deg in zip(train_data['Stop Latitude'],
                                train_data['Stop Longitude']):
        lat1 = radians(lat_deg)
        lon1 = radians(lon_deg)
        dlon = lon2 - lon1
        dlat = lat2 - lat1
        a = sin(dlat / 2)**2 + cos(lat1) * cos_lat2 * sin(dlon / 2)**2
        # Rounding can push `a` marginally above 1, which would make
        # sqrt(1 - a) raise ValueError. NaN is left untouched by this test.
        if a > 1.0:
            a = 1.0
        c = 2 * atan2(sqrt(a), sqrt(1 - a))
        distance = R * c
        if distance != 0 and distance < min_distance:
            min_distance = distance
    return min_distance
def _as_scalar(value):
    """Return ``value``, or its first element when it is a pandas object."""
    if hasattr(value, 'iloc'):
        return value.iloc[0]
    return value


def calculate_distance_from_suburb_center(suburb_data,bus_stop_data):
    """Return the haversine distance (km) between a suburb centre and a stop.

    Args:
        suburb_data: Either a single row (indexable by 'latitude' and
            'longitude', in degrees) or a DataFrame slice; for a DataFrame
            the first row is used, so duplicate suburb entries no longer
            raise ``TypeError``.
        bus_stop_data: Row-like object holding the stop's 'Stop Latitude'
            and 'Stop Longitude', in degrees.

    Returns:
        The distance in kilometres. A stop located exactly on the centre
        yields 0.0 (previously this case returned ``sys.maxsize``).
    """
    # approximate radius of earth in km
    R = 6373.0
    lat1 = radians(_as_scalar(suburb_data['latitude']))
    lon1 = radians(_as_scalar(suburb_data['longitude']))
    lat2 = radians(bus_stop_data['Stop Latitude'])
    lon2 = radians(bus_stop_data['Stop Longitude'])
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = sin(dlat / 2)**2 + cos(lat1) * cos(lat2) * sin(dlon / 2)**2
    # Rounding can push `a` marginally above 1, which would make
    # sqrt(1 - a) raise ValueError.
    if a > 1.0:
        a = 1.0
    c = 2 * atan2(sqrt(a), sqrt(1 - a))
    # There is only one pair of points, so the distance itself is the answer;
    # no minimum search or zero-distance filtering applies here.
    return R * c
def select_bus_stops(data, stops_per_suburb=3):
    """Pick the first few stops of every sufficiently large suburb.

    Args:
        data: DataFrame of bus stops with a 'Suburb' column and six columns
            in the order 'Stop ID', 'Stop Latitude', 'Stop Longitude',
            'Stop Name', 'Suburb', 'Location'.
        stops_per_suburb: Number of leading rows to keep per suburb
            (default 3). Only suburbs with strictly more rows than this
            are sampled; smaller suburbs are dropped entirely.

    Returns:
        A new DataFrame with a fresh 0..n-1 index and the six column labels
        listed above, containing the selected rows grouped by suburb.

    Raises:
        ValueError: If ``stops_per_suburb`` is negative.
    """
    if stops_per_suburb < 0:
        raise ValueError('stops_per_suburb must be non-negative')

    columns = ['Stop ID', 'Stop Latitude', 'Stop Longitude',
               'Stop Name', 'Suburb', 'Location']
    selected_rows = []
    for _, group in data.groupby('Suburb'):
        # Strictly-greater threshold is kept from the original behaviour.
        if len(group) > stops_per_suburb:
            selected_rows.extend(group.iloc[:stops_per_suburb].values.tolist())
    return pd.DataFrame(selected_rows, columns=columns)
# Load every bus stop and sample the leading stops of each suburb.
data = pd.read_csv('./Data/Bus_Stops.csv')
final_df = select_bus_stops(data)
print(final_df.head())

# Feature 1: distance from each sampled stop to its nearest neighbour,
# searched across the full (unsampled) stop list.
min_distance_stop = [
    calculate_distance_to_next_stop(data, row)
    for index, row in final_df.iterrows()
]
final_df['distance_next_stop'] = pd.Series(min_distance_stop)
print(final_df.head())

# Feature 2: distance from each sampled stop to the centre of its suburb.
suburb_desc = pd.read_csv('./Data/suburb_desc.csv')
# The set of known suburb names does not change inside the loop: read it once.
known_suburbs = suburb_desc['input_string'].values
print(known_suburbs)
distance_from_suburb_center = list()
for index, row in final_df.iterrows():
    if row['Suburb'] in known_suburbs:
        # Pass a single row (the first match) rather than a DataFrame slice,
        # so the distance function receives scalar coordinates even when a
        # suburb name appears more than once in suburb_desc.
        suburb_data = suburb_desc.loc[suburb_desc['input_string'] == row['Suburb']].iloc[0]
        distance_from_suburb_center.append(calculate_distance_from_suburb_center(suburb_data, row))
    else:
        # Suburb has no known centre: record 0 as the placeholder value.
        distance_from_suburb_center.append(0)
final_df['distance_suburb_center'] = pd.Series(distance_from_suburb_center)
print(final_df.head())

final_df.to_csv('bus_stop_data.csv' , encoding='utf-8')