Skip to content

Commit d28c406

Browse files
committed
v1.0.5
1 parent 42fbc56 commit d28c406

6 files changed

Lines changed: 161 additions & 106 deletions

File tree

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
name='uaddress',
1212
packages=['uaddress'],
1313
description='Ukrainian address parser',
14-
version='1.0.4',
14+
version='1.0.5',
1515
author='Evgen Kytonin',
1616
author_email='killfess@gmail.com',
1717
license='MIT',

training/data.xml

Lines changed: 42 additions & 0 deletions
Large diffs are not rendered by default.

uaddress/__init__.py

Lines changed: 17 additions & 105 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,9 @@
66
import re
77
import argparse
88

9+
from .labels import LABELS
10+
from .types import TYPES
11+
912
try:
1013
from collections import OrderedDict
1114
except ImportError:
@@ -15,116 +18,13 @@
1518
import pycrfsuite
1619
import probableparsing
1720

18-
# The address components
19-
20-
LABELS = [
21-
22-
'Country',
23-
'RegionType',
24-
'Region',
25-
'CountyType',
26-
'County',
27-
'SubLocalityType',
28-
'SubLocality',
29-
'LocalityType',
30-
'Locality',
31-
'StreetType',
32-
'Street',
33-
'HousingType',
34-
'Housing',
35-
'HostelType',
36-
'Hostel',
37-
'HouseNumberType',
38-
'HouseNumber',
39-
'HouseNumberAdditionally',
40-
'SectionType',
41-
'Section',
42-
'ApartmentType',
43-
'Apartment',
44-
'RoomType',
45-
'Room',
46-
'Sector',
47-
'EntranceType',
48-
'Entrance',
49-
'FloorType',
50-
'Floor',
51-
'PostCode',
52-
'Manually',
53-
'NotAddress',
54-
'Comment',
55-
'AdditionalData'
56-
57-
]
58-
5921
PARENT_LABEL = 'AddressString'
6022
GROUP_LABEL = 'AddressCollection'
6123

6224
MODEL_FILE = 'uaddr.crfsuite'
6325
MODEL_FILES = ''
6426
backupModel = False
6527

66-
TYPES = {
67-
68-
# REGION
69-
"обл", "обл.", "область",
70-
# CITY
71-
"м", "м.", "місто", "г", "г.", "город",
72-
# DISTRICT
73-
"р-н", "р-н.", "рн", "рн.", "р-он", "район",
74-
# MICRODISTRICT
75-
"мікр", "мікр.", "мн", "мр", "мкрн", "мкр", "мікрорайон", "микр", "микр.", "микрорайон", "м-н",
76-
# VILLAGE
77-
"пос", "пос.", "смт", "смт.", "с.м.т", "пгт", "п г т", "пгт.", "село", "селище", "поселок", "с-ще",
78-
# STREET
79-
"вул", "вул.", "вулиця", "ул", "ул.", "улица", "влу.", "в.", "вулю",
80-
"пров", "пров.", "провулок", "пер", "пер.", "переулок", "прос", "провул", "прв.", "перевуло", "про",
81-
"бул", "бул.", "б-р", "бр", "бр.", "бур", "бур.", "бульвар", "бульв.", "бул-р.",
82-
"просп", "просп.", "прт", "прт.", "прокт", "прокт.", "пр", "пр.", "п-т", "п-т.", "п-рт.", "проспект", "п-т", "пр-кт", "пр-к",
83-
"ж\м" , "масив", "массив", "житловий масив", "жилой массив", "ж.м.",
84-
"ш.", "шосе", "шоссе",
85-
"алея", "аллея",
86-
"майд", "майд.", "майдан",
87-
"розвилка", "развилка",
88-
"узвіз", "спуск",
89-
"проїзд", "проезд",
90-
"дорога",
91-
"наб", "наб.", "набер.", "набережна", "набережная",
92-
"парк",
93-
"сквер",
94-
"тупик",
95-
"прохід", "проход",
96-
"ст", "ст.", "станція", "станция",
97-
"остр", "остр.", "острів", "остров",
98-
"шлях", "путь",
99-
"гай", "роща",
100-
"пл", "пл.", "площа", "площадь",
101-
"в'їзд", "въезд",
102-
"лінія", "линия",
103-
"траса", "трасса",
104-
"урочище",
105-
"шахта",
106-
"хутір", "хутор",
107-
"роз'їзд", "разъезд",
108-
"квартал",
109-
# HOUSING
110-
"корп.", "корп", "корпус",
111-
# HOSTEL
112-
"гурт", "гурт.", "гуртожиток", "общ", "общ.", "общежитие",
113-
# HOUSE
114-
"буд.", "будинок", "дом", "д.", "б.",
115-
# APARTMENT
116-
"кв.", "квартира",
117-
# ROOM
118-
"прим.",
119-
# SECTION
120-
"секція",
121-
# ENTRANCE
122-
"підʼїзд", "подъезд",
123-
# FLOOR
124-
"поверх", "этаж"
125-
126-
}
127-
12828
regex_tokens = r"\w+(?:\s|\.?)\-(?:\s)\w+|\([0-9а-яА-ЯіІїЇґҐ].*?\)|\(*\b[^\s,;#&]+[.)]*|\/\d+|[№][0-9]*"
12929

13030
try:
@@ -166,7 +66,7 @@ def tokenize(address_string):
16666
re_tokens = re.compile(regex_tokens, re.VERBOSE | re.UNICODE)
16767

16868
address_string = re.sub(r'\s+', ' ', address_string)
169-
address_string = unMergeType(address_string)
69+
address_string = fixString(address_string)
17070

17171
tokens = re_tokens.findall(address_string)
17272

@@ -175,7 +75,7 @@ def tokenize(address_string):
17575

17676
return tokens
17777

178-
def unMergeType(address):
78+
def fixString(address):
17979

18080
##
18181
# Unmerge string, if exists housenumber + apartment type
@@ -213,6 +113,18 @@ def unMergeType(address):
213113
if re.findall(r'(?<=[a-zA-Zа-яА-ЯіІїЇґҐ])\(', address):
214114
address = re.sub(r'(?<=[a-zA-Zа-яА-ЯіІїЇґҐ])\(', ' (', address)
215115

116+
##
117+
# REMOVE SPACE AFTER HYPHEN
118+
#
119+
if re.findall(r'\-\s+', address):
120+
address = re.sub(r'\-\s+', '-', address)
121+
122+
##
123+
# ADD SPACE BEFORE AND AFTER BRACKET
124+
#
125+
if re.findall(r'(?=\()|(?<=\))', address):
126+
address = re.sub(r'(?=\()|(?<=\))', ' ', address)
127+
216128
return address
217129

218130
def tokenFeatures(token):

uaddress/labels.py

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
# The address components
2+
3+
LABELS = [
4+
5+
'Country',
6+
'RegionType',
7+
'Region',
8+
'CountyType',
9+
'County',
10+
'SubLocalityType',
11+
'SubLocality',
12+
'LocalityType',
13+
'Locality',
14+
'StreetType',
15+
'Street',
16+
'HousingType',
17+
'Housing',
18+
'HostelType',
19+
'Hostel',
20+
'HouseNumberType',
21+
'HouseNumber',
22+
'HouseNumberAdditionally',
23+
'SectionType',
24+
'Section',
25+
'ApartmentType',
26+
'Apartment',
27+
'RoomType',
28+
'Room',
29+
'Sector',
30+
'EntranceType',
31+
'Entrance',
32+
'FloorType',
33+
'Floor',
34+
'PostCode',
35+
'Manually',
36+
'NotAddress',
37+
'Comment',
38+
'AdditionalData'
39+
40+
]

uaddress/types.py

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
1+
TYPES = {
2+
3+
# REGION
4+
"обл", "обл.", "область",
5+
# CITY
6+
"м", "м.", "місто", "г", "г.", "город",
7+
# DISTRICT
8+
"р-н", "р-н.", "рн", "рн.", "р-он", "район",
9+
# MICRODISTRICT
10+
"мікр", "мікр.", "мн", "мр", "мкрн", "мкр", "мікрорайон", "микр", "микр.", "микрорайон", "м-н",
11+
# VILLAGE
12+
"пос", "пос.", "смт", "смт.", "с.м.т", "пгт", "п г т", "пгт.", "село", "селище", "поселок", "с-ще",
13+
# STREET
14+
"вул", "вул.", "вулиця", "ул", "ул.", "улица", "влу.", "в.", "вулю", "улиця",
15+
"пров", "пров.", "провулок", "пер", "пер.", "переулок", "прос", "провул", "прв.", "перевуло", "про",
16+
"бул", "бул.", "б-р", "бр", "бр.", "бур", "бур.", "бульвар", "бульв.", "бул-р.",
17+
"просп", "просп.", "прт", "прт.", "прокт", "прокт.", "пр", "пр.", "п-т", "п-т.", "п-рт.", "проспект", "п-т", "пр-кт", "пр-к", "прпосп",
18+
"ж\\м", "масив", "массив", "житловий масив", "жилой массив", "ж.м.",
19+
"ш.", "шосе", "шоссе",
20+
"алея", "аллея",
21+
"майд", "майд.", "майдан",
22+
"розвилка", "развилка",
23+
"узвіз", "спуск",
24+
"проїзд", "проезд",
25+
"дорога",
26+
"наб", "наб.", "набер.", "набережна", "набережная",
27+
"парк",
28+
"сквер",
29+
"тупик",
30+
"прохід", "проход",
31+
"ст", "ст.", "станція", "станция",
32+
"остр", "остр.", "острів", "остров",
33+
"шлях", "путь",
34+
"гай", "роща",
35+
"пл", "пл.", "площа", "площадь",
36+
"в'їзд", "въезд",
37+
"лінія", "линия",
38+
"траса", "трасса",
39+
"урочище",
40+
"шахта",
41+
"хутір", "хутор",
42+
"роз'їзд", "разъезд",
43+
"квартал",
44+
# HOUSING
45+
"корп.", "корп", "корпус",
46+
# HOSTEL
47+
"гурт", "гурт.", "гуртожиток", "общ", "общ.", "общежитие",
48+
# HOUSE
49+
"буд.", "будинок", "дом", "д.", "б.",
50+
# APARTMENT
51+
"кв.", "квартира",
52+
# ROOM
53+
"прим.",
54+
# SECTION
55+
"секція",
56+
# ENTRANCE
57+
"підʼїзд", "подъезд",
58+
# FLOOR
59+
"поверх", "этаж"
60+
61+
}

uaddress/uaddr.crfsuite

8.38 KB
Binary file not shown.

0 commit comments

Comments
 (0)