66import re
77import argparse
88
9+ from .labels import LABELS
10+ from .types import TYPES
11+
912try :
1013 from collections import OrderedDict
1114except ImportError :
1518import pycrfsuite
1619import probableparsing
1720
18- # The address components
19-
20- LABELS = [
21-
22- 'Country' ,
23- 'RegionType' ,
24- 'Region' ,
25- 'CountyType' ,
26- 'County' ,
27- 'SubLocalityType' ,
28- 'SubLocality' ,
29- 'LocalityType' ,
30- 'Locality' ,
31- 'StreetType' ,
32- 'Street' ,
33- 'HousingType' ,
34- 'Housing' ,
35- 'HostelType' ,
36- 'Hostel' ,
37- 'HouseNumberType' ,
38- 'HouseNumber' ,
39- 'HouseNumberAdditionally' ,
40- 'SectionType' ,
41- 'Section' ,
42- 'ApartmentType' ,
43- 'Apartment' ,
44- 'RoomType' ,
45- 'Room' ,
46- 'Sector' ,
47- 'EntranceType' ,
48- 'Entrance' ,
49- 'FloorType' ,
50- 'Floor' ,
51- 'PostCode' ,
52- 'Manually' ,
53- 'NotAddress' ,
54- 'Comment' ,
55- 'AdditionalData'
56-
57- ]
58-
# Module-level configuration constants.
# NOTE(review): the two *_LABEL values look like the wrapper tag names used for
# a single labelled address string and for a collection of them in the
# training data -- confirm against the code that reads/writes that data.
5921PARENT_LABEL = 'AddressString'
6022GROUP_LABEL = 'AddressCollection'
6123
# File name of the trained model; presumably loaded through pycrfsuite
# (imported above) -- TODO confirm where it is opened.
6224MODEL_FILE = 'uaddr.crfsuite'
# NOTE(review): purpose of MODEL_FILES (empty string) and backupModel (False)
# is not visible in this part of the file -- verify against their users.
6325MODEL_FILES = ''
6426backupModel = False
6527
66- TYPES = {
67-
68- # REGION
69- "обл" , "обл." , "область" ,
70- # CITY
71- "м" , "м." , "місто" , "г" , "г." , "город" ,
72- # DISTRICT
73- "р-н" , "р-н." , "рн" , "рн." , "р-он" , "район" ,
74- # MICRODISTRICT
75- "мікр" , "мікр." , "мн" , "мр" , "мкрн" , "мкр" , "мікрорайон" , "микр" , "микр." , "микрорайон" , "м-н" ,
76- # VILLAGE
77- "пос" , "пос." , "смт" , "смт." , "с.м.т" , "пгт" , "п г т" , "пгт." , "село" , "селище" , "поселок" , "с-ще" ,
78- # STREET
79- "вул" , "вул." , "вулиця" , "ул" , "ул." , "улица" , "влу." , "в." , "вулю" ,
80- "пров" , "пров." , "провулок" , "пер" , "пер." , "переулок" , "прос" , "провул" , "прв." , "перевуло" , "про" ,
81- "бул" , "бул." , "б-р" , "бр" , "бр." , "бур" , "бур." , "бульвар" , "бульв." , "бул-р." ,
82- "просп" , "просп." , "прт" , "прт." , "прокт" , "прокт." , "пр" , "пр." , "п-т" , "п-т." , "п-рт." , "проспект" , "п-т" , "пр-кт" , "пр-к" ,
83- "ж\м" , "масив" , "массив" , "житловий масив" , "жилой массив" , "ж.м." ,
84- "ш." , "шосе" , "шоссе" ,
85- "алея" , "аллея" ,
86- "майд" , "майд." , "майдан" ,
87- "розвилка" , "развилка" ,
88- "узвіз" , "спуск" ,
89- "проїзд" , "проезд" ,
90- "дорога" ,
91- "наб" , "наб." , "набер." , "набережна" , "набережная" ,
92- "парк" ,
93- "сквер" ,
94- "тупик" ,
95- "прохід" , "проход" ,
96- "ст" , "ст." , "станція" , "станция" ,
97- "остр" , "остр." , "острів" , "остров" ,
98- "шлях" , "путь" ,
99- "гай" , "роща" ,
100- "пл" , "пл." , "площа" , "площадь" ,
101- "в'їзд" , "въезд" ,
102- "лінія" , "линия" ,
103- "траса" , "трасса" ,
104- "урочище" ,
105- "шахта" ,
106- "хутір" , "хутор" ,
107- "роз'їзд" , "разъезд" ,
108- "квартал" ,
109- # HOUSING
110- "корп." , "корп" , "корпус" ,
111- # HOSTEL
112- "гурт" , "гурт." , "гуртожиток" , "общ" , "общ." , "общежитие" ,
113- # HOUSE
114- "буд." , "будинок" , "дом" , "д." , "б." ,
115- # APARTMENT
116- "кв." , "квартира" ,
117- # ROOM
118- "прим." ,
119- # SECTION
120- "секція" ,
121- # ENTRANCE
122- "підʼїзд" , "подъезд" ,
123- # FLOOR
124- "поверх" , "этаж"
125-
126- }
127-
# Token pattern used by tokenize(), which compiles it with
# re.VERBOSE | re.UNICODE and applies it with findall().
# Alternatives, tried left to right:
#   1. \w+(?:\s|\.?)\-(?:\s)\w+      word, then one whitespace or an optional
#                                    dot, a hyphen, exactly one whitespace,
#                                    and another word
#   2. \([0-9а-яА-ЯіІїЇґҐ].*?\)      parenthesised group that starts with a
#                                    digit or a Cyrillic letter, matched
#                                    lazily up to the first ')'
#   3. \(*\b[^\s,;#&]+[.)]*          optional '(' characters, then a run of
#                                    characters other than whitespace , ; # &
#                                    starting at a word boundary, plus any
#                                    trailing '.' or ')'
#   4. \/\d+                         a slash followed by digits
#   5. [№][0-9]*                     the numero sign, optionally followed by
#                                    digits
# Because of re.VERBOSE, literal whitespace added to this pattern outside a
# character class would be ignored; keep using \s.
# NOTE(review): fixString() rewrites '-' followed by whitespace to a bare '-'
# before findall() runs, so alternative 1 (which requires whitespace after
# the hyphen) appears to be unable to match any more -- confirm.
12828regex_tokens = r"\w+(?:\s|\.?)\-(?:\s)\w+|\([0-9а-яА-ЯіІїЇґҐ].*?\)|\(*\b[^\s,;#&]+[.)]*|\/\d+|[№][0-9]*"
12929
13030try :
@@ -166,7 +66,7 @@ def tokenize(address_string):
16666 re_tokens = re .compile (regex_tokens , re .VERBOSE | re .UNICODE )
16767
16868 address_string = re .sub (r'\s+' , ' ' , address_string )
169- address_string = unMergeType (address_string )
69+ address_string = fixString (address_string )
17070
17171 tokens = re_tokens .findall (address_string )
17272
@@ -175,7 +75,7 @@ def tokenize(address_string):
17575
17676 return tokens
17777
178- def unMergeType (address ):
78+ def fixString (address ):
17979
18080 ##
18181 # Unmerge string, if exists housenumber + apartment type
@@ -213,6 +113,18 @@ def unMergeType(address):
213113 if re .findall (r'(?<=[a-zA-Zа-яА-ЯіІїЇґҐ])\(' , address ):
214114 address = re .sub (r'(?<=[a-zA-Zа-яА-ЯіІїЇґҐ])\(' , ' (' , address )
215115
116+ ##
117+ # REMOVE SPACE AFTER HYPHEN
118+ #
119+ if re .findall (r'\-\s+' , address ):
120+ address = re .sub ('\-\s+' , '-' , address )
121+
122+ ##
123+ # ADD SPACE BEFORE AND AFTER BRACKET
124+ #
125+ if re .findall (r'(?=\()|(?<=\))' , address ):
126+ address = re .sub ('(?=\()|(?<=\))' , ' ' , address )
127+
216128 return address
217129
218130def tokenFeatures (token ):
0 commit comments