-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdata_prep.py
More file actions
68 lines (57 loc) · 2.05 KB
/
data_prep.py
File metadata and controls
68 lines (57 loc) · 2.05 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
import logging
from utils import data, plotting
from config import Config
cfg = Config()
logging.basicConfig(level=cfg.log_level)
logger = logging.getLogger(__name__)
df = data.load_and_clean_obs_data()
print(df.info())
keep_cols = ['uuid',
'filename',
'observed_on_dt',
'observed_on_month',
'observed_on_day',
'observed_on_year',
'time_observed_at_dt',
'time_observed_at_hour',
'time_observed_at_minute',
'time_observed_at_second',
'time_zone',
'large_image_url',
'num_identification_agreements',
'num_identification_disagreements',
'latitude',
'longitude',
'taxon_id',
'genus',
'scientific_name',
'morphology'
]
df = df[keep_cols].copy()
df = df.sort_values(by=['scientific_name'], ascending=False)
print(df.head())
logger.info('Number of unique genus values: %d', df['genus'].nunique())
# Plotting
plotting.plot_class_distribution(df)
plotting.plot_time(df, column='observed_on_day', type='Day')
plotting.plot_time(df, column='observed_on_month', type='Month')
plotting.plot_time(df, column='observed_on_year', type='Year')
plotting.plot_location(df, facet_type='facetted')
plotting.plot_location(df, facet_type='non-facetted')
# Download images
failed_uuids = data.save_imgs(df)
if failed_uuids and len(failed_uuids) > 0:
logger.info('Failed to download images for UUIDs: %s', failed_uuids)
df = df[~df['uuid'].isin(failed_uuids)].reset_index(drop=True)
# Save counts of observations
if cfg.save_counts:
data.save_counts(df, col='scientific_name')
data.save_counts(df, col='genus')
# Save location data
if cfg.save_location:
df[['filename', 'latitude', 'longitude']].copy().to_csv(
cfg.EDA_dir/'location.csv', index=False)
# Saving cleaned results
df.to_csv(cfg.data_dir / 'obs_data_cleaned.csv', index=False)
# Split data into train, validation, and test sets
data.train_test_split(overwrite=True)