ITACA_Insurace_NLP_v2

Runtime error

App Files Files Community

ITACA_Insurace_NLP_v2 / keyword_extraction.py

danielperales

Duplicate from dperales/ITACA_Insurance_Core_v4

27880c1 over 2 years ago

raw

history blame contribute delete

4.73 kB

	import nltk
	import pytextrank
	import re
	from operator import itemgetter
	import en_core_web_sm


	class KeywordExtractor:
	"""
	Keyword Extraction on text data
	Attributes:
	nlp: An instance English pipeline optimized for CPU for spacy
	"""

	def __init__(self):
	self.nlp = en_core_web_sm.load()
	self.nlp.add_pipe("textrank")

	def get_keywords(self, text, max_keywords):
	"""
	Extract keywords from text.
	Parameters:
	text (str): The user input string to extract keywords from
	Returns:
	kws (list): list of extracted keywords
	"""

	doc = self.nlp(text)

	kws = [i.text for i in doc._.phrases[:max_keywords]]

	return kws

	def get_keyword_indices(self, kws, text):
	"""
	Extract keywords from text.
	Parameters:
	kws (list): list of extracted keywords
	text (str): The user input string to extract keywords from
	Returns:
	keyword_indices (list): list of indices for keyword boundaries in text
	"""

	keyword_indices = []
	for s in kws:
	indices = [[m.start(), m.end()] for m in re.finditer(re.escape(s), text)]
	keyword_indices.extend(indices)

	return keyword_indices

	def merge_overlapping_indices(self, keyword_indices):
	"""
	Merge overlapping keyword indices.
	Parameters:
	keyword_indices (list): list of indices for keyword boundaries in text
	Returns:
	keyword_indices (list): list of indices for keyword boundaries in with overlapping combined
	"""

	# Sort the array on the basis of start values of intervals.
	keyword_indices.sort()

	stack = []
	# insert first interval into stack
	stack.append(keyword_indices[0])
	for i in keyword_indices[1:]:
	# Check for overlapping interval,
	# if interval overlap
	if (stack[-1][0] <= i[0] <= stack[-1][-1]) or (stack[-1][-1] == i[0]-1):
	stack[-1][-1] = max(stack[-1][-1], i[-1])
	else:
	stack.append(i)
	return stack

	def merge_until_finished(self, keyword_indices):
	"""
	Loop until no overlapping keyword indices left.
	Parameters:
	keyword_indices (list): list of indices for keyword boundaries in text
	Returns:
	keyword_indices (list): list of indices for keyword boundaries in with overlapping combined
	"""

	len_indices = 0
	while True:
	# Merge overlapping indices
	merged = self.merge_overlapping_indices(keyword_indices)
	# Check to see if merging reduced number of annotation indices
	# If merging did not reduce list return final indicies
	if len_indices == len(merged):
	out_indices = sorted(merged, key=itemgetter(0))
	return out_indices
	else:
	len_indices = len(merged)

	def get_annotation(self, text, keyword_indices):
	"""
	Create text annotation for extracted keywords.
	Parameters:
	keyword_indices (list): list of indices for keyword boundaries in text
	Returns:
	annotation (list): list of tuples for generating html
	"""

	# Turn list to numpy array
	arr = list(text)

	# Loop through indices in list and insert delimeters
	for idx in sorted(keyword_indices, reverse=True):
	arr.insert(idx[0], "<kw>")
	arr.insert(idx[1]+1, "<!kw> <kw>")

	# join array
	joined_annotation = ''.join(arr)

	# split array on delimeter
	split = joined_annotation.split('<kw>')

	# Create annotation for keywords in text
	annotation = [(x.replace('<!kw> ', ''), "KEY", "#26aaef") if "<!kw>" in x else x for x in split]

	return annotation

	def generate(self, text, max_keywords):
	"""
	Create text annotation for extracted keywords.
	Parameters:
	text (str): The user input string to extract keywords from
	max_keywords (int): Limit on number of keywords to generate
	Returns:
	annotation (list): list of tuples for generating html
	kws (list): list of extracted keywords
	"""

	kws = self.get_keywords(text, max_keywords)

	indices = list(self.get_keyword_indices(kws, text))
	if indices:
	indices_merged = self.merge_until_finished(indices)
	annotation = self.get_annotation(text, indices_merged)
	else:
	annotation = None

	return annotation, kws