Source code for nlp.extractor

import os
import re
import operator


[docs]class NLPExtractor(object): def __init__(self): self.stop_words_path = os.path.join( os.path.dirname(os.path.abspath(__file__)), 'stopwords.txt') self.stop_words_pattern = self.build_stop_word_regex()
[docs] def build_stop_word_regex(self): """ Creates stop word regex. :return: stop word pattern. """ stop_word_list = self.load_stop_words() stop_word_regex_list = [] for word in stop_word_list: word_regex = r'\b' + word + r'(?![\w-])' stop_word_regex_list.append(word_regex) stop_word_pattern = re.compile('|'.join(stop_word_regex_list), re.IGNORECASE) return stop_word_pattern
[docs] def load_stop_words(self): """ Utility function to load stop words from a file and return as a list of words. :return: list A list of stop words. """ stop_words = [] for line in open(self.stop_words_path): if line.strip()[0:1] != "#": for word in line.split(): # in case more than one per line stop_words.append(word) return stop_words
@staticmethod
[docs] def is_number(word): """ Checks whether word is a number. :param str word: Word to be checked. :return: True or False """ try: float(word) if '.' in word else int(word) return True except ValueError: return False
@staticmethod
[docs] def separate_words(text, min_word_return_size): """ Utility function to return a list of all words that are have a length greater than a specified number of characters. :param str text: The text that must be split in to words. :param int min_word_return_size: The minimum no of characters a word \ must have to be included. :return: list of separated words. """ splitter = re.compile('[^a-zA-Z0-9_\\+\\-/]') words = [] for single_word in splitter.split(text): current_word = single_word.strip().lower() # leave numbers in phrase, but don't count as words, # since they tend to invalidate scores of their phrases if len(current_word) > min_word_return_size and current_word != '' \ and not NLPExtractor.is_number(current_word): words.append(current_word) return words
@staticmethod
[docs] def split_sentences(text): """ Utility function to return a list of sentences. :param str text: The text that must be split in to sentences. :return: sentences List of sentences created due to split. """ sentence_delimiters = re.compile( u'[.!?,;:\t\\\\"\\(\\)\\\'\u2019\u2013]|\\s\\-\\s') sentences = sentence_delimiters.split(text) return sentences
@staticmethod
[docs] def generate_candidate_keywords(sentence_list, stopword_pattern): """ Generates list of keywords candidates. :param list sentence_list: List of sentences to be processed. :param str stopword_pattern: Stop words pattern. :return: list of keywords """ phrase_list = [] for s in sentence_list: tmp = re.sub(stopword_pattern, '|', s.strip()) phrases = tmp.split("|") for phrase in phrases: phrase = phrase.strip().lower() if phrase != "": phrase_list.append(phrase) return phrase_list
@staticmethod
[docs] def calculate_word_scores(phrase_list): """ Calculates words scores based on their frequency and degree. :param list phrase_list: List of phrases to be processed. :return: mapping between word and its score. """ word_frequency = {} word_degree = {} for phrase in phrase_list: word_list = NLPExtractor.separate_words(phrase, 0) word_list_length = len(word_list) word_list_degree = word_list_length - 1 for word in word_list: word_frequency.setdefault(word, 0) word_frequency[word] += 1 word_degree.setdefault(word, 0) word_degree[word] += word_list_degree for item in word_frequency: word_degree[item] = word_degree[item] + word_frequency[item] word_score = {} for item in word_frequency: word_score.setdefault(item, 0) word_score[item] = word_degree[item] / ( word_frequency[item] * 1.0) return word_score
@staticmethod
[docs] def generate_candidate_keyword_scores(phrase_list, word_score): """ Generates scores for candidate keywords. :param list phrase_list: List of phrases to be processed. :param map word_score: Mapping between word and its score. :return: mapping between phrases and their scores. """ keyword_candidates = {} for phrase in phrase_list: keyword_candidates.setdefault(phrase, 0) word_list = NLPExtractor.separate_words(phrase, 0) candidate_score = 0 for word in word_list: candidate_score += word_score[word] keyword_candidates[phrase] = candidate_score return keyword_candidates
[docs] def run(self, text): """ Extracts keywords from the text. :param str text: Text to be processed. :return: list of keywords. """ sentence_list = NLPExtractor.split_sentences(text) phrase_list = NLPExtractor.generate_candidate_keywords( sentence_list, self.stop_words_pattern) word_scores = NLPExtractor.calculate_word_scores(phrase_list) keyword_candidates = NLPExtractor.generate_candidate_keyword_scores( phrase_list, word_scores) sorted_keywords = sorted(keyword_candidates.iteritems(), key=operator.itemgetter(1), reverse=True) keywords = [] for keyword, _ in sorted_keywords: keywords.append(keyword) return keywords