Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

69, 74: highlighting synonym stems bug fixes #510

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
97 changes: 65 additions & 32 deletions api/namex/analytics/solr.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,11 +129,16 @@ def get_conflict_results(cls, name, bucket, start=0, rows=100):
# handle non-ascii chars in name
name = ''.join([i if ord(i) < 128 else parse.quote(i) for i in name])
name = cls.remove_stopwords_designations(name)
list_name_split = name.split()

if name.find('*') != -1:
list_name_split = name.split()
else:
list_name_split,name = cls.combine_multi_word_synonyms(name, solr_base_url)
list_name_split = [x.upper() for x in list_name_split]

prox_search_strs,old_alg_search_strs,phon_search_strs = cls.build_solr_search_strs(name, list_name_split)
synonyms_for_word = cls.get_synonyms_for_words(list_name_split)

synonyms_for_word = cls.get_synonyms_for_words(list_name_split)
if bucket == 'synonym':
connections = cls.get_synonym_results(solr_base_url, name, prox_search_strs, old_alg_search_strs, start, rows)

Expand Down Expand Up @@ -177,24 +182,25 @@ def get_conflict_results(cls, name, bucket, start=0, rows=100):
synonyms = result_name[result_name.find('(') + 1:result_name.find(')')]
synonyms = [x.strip() for x in synonyms.split(',')]
for synonym in synonyms:
processed_synonyms_dict = cls.word_pre_processing(synonyms_for_word[synonym.upper()],
'synonyms',
solr_base_url
)
for word in processed_synonyms_dict:
for item in result['response']['docs']:
if item['name'] not in seen_ordered_names and item['name'] not in missed_names:
missed_names.append(item['name'])
if item['name'] not in seen_ordered_names:
processed_name = cls.name_pre_processing(item['name']).upper()
if ' ' + processed_synonyms_dict[word].upper() in ' ' + processed_name.upper():
seen_ordered_names.append(item['name'])
ordered_names.append({'name_info':item, 'stems': [processed_synonyms_dict[word].upper()]})
missed_names.remove(item['name'])
elif ' ' + word.upper() in ' ' + processed_name.upper():
seen_ordered_names.append(item['name'])
ordered_names.append({'name_info': item, 'stems': [word.upper()]})
missed_names.remove(item['name'])
if synonym.upper() in synonyms_for_word:
processed_synonyms_dict = cls.word_pre_processing(synonyms_for_word[synonym.upper()],
'synonyms',
solr_base_url
)
for word in processed_synonyms_dict:
for item in result['response']['docs']:
if item['name'] not in seen_ordered_names and item['name'] not in missed_names:
missed_names.append(item['name'])
if item['name'] not in seen_ordered_names:
processed_name = cls.name_pre_processing(item['name']).upper()
if ' ' + processed_synonyms_dict[word].upper() in ' ' + processed_name.upper():
seen_ordered_names.append(item['name'])
ordered_names.append({'name_info':item, 'stems': [processed_synonyms_dict[word].upper()]})
missed_names.remove(item['name'])
elif ' ' + word.upper() in ' ' + processed_name.upper():
seen_ordered_names.append(item['name'])
ordered_names.append({'name_info': item, 'stems': [word.upper()]})
missed_names.remove(item['name'])

else:
for item in result['response']['docs']:
Expand All @@ -205,9 +211,7 @@ def get_conflict_results(cls, name, bucket, start=0, rows=100):
final_names_list = []

# order based on alphabetization of swapped in synonyms

if bucket == 'synonym':

processed_words_dict = cls.word_pre_processing(list_name_split, 'synonyms', solr_base_url)

pivot_list = []
Expand Down Expand Up @@ -454,10 +458,10 @@ def _compress_name(cls, name):

# TODO: these should be loaded from somewhere.
designations = [
'corp.', 'corporation', 'inc.', 'incorporated', 'incorporee', 'l.l.c.', 'limited', 'limited liability co.',
'limited liability company', 'limited liability partnership', 'limitee', 'llc', 'llp', 'ltd.', 'ltee',
'corp.', 'corporation', 'inc.', 'incorporated', 'incorporee', 'l.l.c.', 'limited liability co.',
'limited liability company', 'limited liability partnership', 'limited partnership','limitee', 'llc', 'llp', 'ltd.', 'ltee',
'sencrl', 'societe a responsabilite limitee', 'societe en nom collectif a responsabilite limitee', 'srl',
'ulc', 'unlimited liability company']
'ulc', 'unlimited liability company', 'limited',]

# Match the designation with whitespace before and either followed by whitespace or end of line.
for designation in designations:
Expand Down Expand Up @@ -636,7 +640,7 @@ def _get_synonym_list(cls, token):
# Look up each token in name, and if it is in the synonyms then we need to search for it separately.
@classmethod
def _get_synonyms_clause(cls, name):
name = re.sub(' +', ' ', name)
# name = re.sub(' +', ' ', name)
current_app.logger.debug('getting synonyms for: {}'.format(name))
clause = ''
synonyms = []
Expand Down Expand Up @@ -719,10 +723,10 @@ def _get_name_copy_clause(cls, name):
def remove_stopwords_designations(cls, name):
# TODO: these should be loaded from somewhere.
designations = [
'corp.', 'corporation', 'inc.', 'incorporated', 'incorporee', 'l.l.c.', 'limited',
'limited liability co.', 'limited liability company', 'limited liability partnership', 'limitee', 'llc',
'llp', 'ltd.', 'ltee', 'sencrl', 'societe a responsabilite limitee',
'societe en nom collectif a responsabilite limitee', 'srl', 'ulc', 'unlimited liability company']
'corp.', 'corp', 'corporation', 'inc.', 'inc', 'incorporated', 'incorporee', 'l.l.c.', 'llc', 'limited partnership',
'limited liability co.', 'limited liability co','limited liability company', 'limited liability partnership', 'limitee',
'llp', 'ltd.', 'ltd', 'ltee', 'sencrl', 'societe a responsabilite limitee',
'societe en nom collectif a responsabilite limitee', 'limited', 'srl', 'ulc', 'unlimited liability company']

stop_words = []
try:
Expand Down Expand Up @@ -751,6 +755,36 @@ def remove_stopwords_designations(cls, name):
name = name.upper().replace(' AND ', ' ').replace('&', ' ').replace('+', ' ')
return name

@classmethod
def combine_multi_word_synonyms(cls, name, solr_base_url):
    """Run *name* through Solr's field-analysis endpoint and return the
    token stream produced after the SynonymGraphFilter stage, so that
    multi-word synonyms (e.g. "real estate" -> "realestate") are combined
    into single tokens before searching.

    :param name: pre-processed name (stopwords/designations already removed)
    :param solr_base_url: base URL of the Solr instance
    :return: tuple of (list of processed tokens, re-joined name string)
    """
    # Cap the rebuilt name at twice the input word count so synonym
    # expansion cannot grow the name without bound.
    max_len = len(name.split()) * 2

    # %2A is an encoded '*' wildcard; strip it so it is not analyzed.
    query = solr_base_url + \
        '/solr/possible.conflicts/analysis/field?analysis.fieldvalue={name}&analysis.fieldname=name' \
        '&wt=json&indent=true'.format(name=parse.quote(name.strip()).replace('%2A', ''))

    processed_words = json.load(request.urlopen(query))

    # The analysis response alternates [stage-class-name, token-list, ...];
    # advance `count` to the token list emitted by the SynonymGraphFilter.
    stages = processed_words['analysis']['field_names']['name']['index']
    count = 0
    for item in stages:
        count += 1
        if item == 'org.apache.lucene.analysis.synonym.SynonymGraphFilter':
            break
    else:
        # SynonymGraphFilter stage not present in the analysis chain:
        # fall back to the unmodified name instead of raising IndexError
        # on the out-of-range stages[count] lookup below.
        return name.split(), name

    name = ''
    word_count = 0
    for text in stages[count]:
        if word_count < max_len:
            name += text['text'] + ' '
        else:
            # Past the cap: append without a separator (original behavior,
            # effectively truncating extra tokens into the last word).
            name += text['text']
        word_count += 1
    name = parse.unquote(name)
    processed_list = name.split()

    return processed_list, name.strip()

@classmethod
def build_solr_search_strs(cls, name, list_name_split):
def replace_nth(string, deleted_substr, added_substr, n):
Expand Down Expand Up @@ -795,7 +829,7 @@ def replace_nth(string, deleted_substr, added_substr, n):
@classmethod
def get_synonyms_for_words(cls, list_name_split):
# get synonym list for each word in the name
list_name_split = [w.replace('*','') for w in list_name_split]
list_name_split = [wrd.replace('*','').upper() for wrd in list_name_split]
synonyms_for_word = {}
for word in list_name_split:
synonyms_for_word[word] = [x.upper().strip() for x in cls._get_synonym_list(word)]
Expand All @@ -820,7 +854,6 @@ def get_synonyms_for_words(cls, list_name_split):

@classmethod
def word_pre_processing(cls, list_of_words, type, solr_base_url):

list_of_words = [w.replace('*', '') for w in list_of_words]
words_to_process = ''
for item in list_of_words:
Expand Down
16 changes: 9 additions & 7 deletions api/tests/python/end_points/test_synonym_match.py
Original file line number Diff line number Diff line change
Expand Up @@ -305,6 +305,8 @@ def test_duplicated_letters(client, jwt, app):
("Jameisons four two zero process server", '----JAMEISONS FOUR TWO synonyms:(process, server, processserver) - PROXIMITY SEARCH'),
("Jameisons four two zero process server", '----JAMEISONS FOUR synonyms:(two, process, server, twozero, processserver) - PROXIMITY SEARCH'),
("Jameisons four two zero process server", '----JAMEISONS synonyms:(four, two, process, server, fourtwo, fourtwozero, twozero, processserver) - PROXIMITY SEARCH'),
("pacific real estate", "----PACIFIC REALESTATE - PROXIMITY SEARCH"),
("pacific real estate", "----PACIFIC synonyms:(realestate) - PROXIMITY SEARCH")
])
def test_multi_word_synonyms(client, jwt, app, criteria, seed):
verify_synonym_match(client, jwt,
Expand Down Expand Up @@ -456,13 +458,13 @@ def test_exact_word_order_stack_title_with_wilcards(client, jwt, app):
@integration_synonym_api
@integration_solr
@pytest.mark.parametrize("criteria, seed", [
('TJ´S BACKCOUNTRY ADVENTURES.', '----TJ´S BACKCOUNTRY ADVENTURES. - PROXIMITY SEARCH'),
('HOUSE BÜBÜ DA WOLF.', '----HOUSE BÜBÜ DA WOLF. - PROXIMITY SEARCH'),
('DIAMANTÉ DIAMOND SETTING', '----DIAMANTÉ DIAMOND SETTING - PROXIMITY SEARCH'),
('MICHELLE¿S BEAR ESSENTIALS.', '----MICHELLE¿S BEAR ESSENTIALS. - PROXIMITY SEARCH'),
('TEST àâçèéêëîïôöùûü BEAR.', '----TEST àâçèéêëîïôöùûü BEAR. - PROXIMITY SEARCH'),
('TEST ÀÂÇÈÉÊËÎÏÔÖÙÛÜS BEAR.', '----TEST ÀÂÇÈÉÊËÎÏÔÖÙÛÜS BEAR. - PROXIMITY SEARCH'),
('TEST °£÷¥·©§¶¼½`¾¢!¦«ª¡¹²³»¿¬±¤®× BEAR.', '----TEST °£÷¥·©§¶¼½`¾¢!¦«ª¡¹²³»¿¬±¤®× BEAR. - PROXIMITY SEARCH'),
('TJ´S BACKCOUNTRY ADVENTURES.', '----TJC 2 B 4 S BACKCOUNTRY ADVENTURES - PROXIMITY SEARCH'),
('HOUSE BÜBÜ DA WOLF.', '----HOUSE BC 39 CBC 39 C DA WOLF - PROXIMITY SEARCH'),
('DIAMANTÉ DIAMOND SETTING', '----DIAMANTC 389 DIAMOND SETTING - PROXIMITY SEARCH'),
('MICHELLE¿S BEAR ESSENTIALS.', '----MICHELLEC 2 BFS BEAR ESSENTIALS - PROXIMITY SEARCH'),
('TEST àâçèéêëîïôöùûü BEAR.', '----TEST C 380 C 382 C 387C388C389C38AC38BC38EC38FC394C396C399C39BC39CBEAR - PROXIMITY SEARCH'),
('TEST ÀÂÇÈÉÊËÎÏÔÖÙÛÜS BEAR.', '----TEST C 380 C 382 C 387C388C389C38AC38BC38EC38FC394C396C399C39BC39CSBEAR - PROXIMITY SEARCH'),
('TEST °£÷¥·©§¶¼½`¾¢!¦«ª¡¹²³»¿¬±¤®× BEAR.', '----TEST C 2 B 0 C 2A3C3B7C2A5C2B7C2A9C2A7C2B6C2BCC2BDC2BEC2A2C2A6C2ABC2AAC2A1C2B9C2B2C2B3C2BBC2BFC2ACC2B1C2A4C2AEC397BEAR - PROXIMITY SEARCH'),
])
def test_bypass_nonascii_characters(client, jwt, app, criteria, seed):
verify_synonym_match(client,
Expand Down