# Source code for nlp.extractor

import os
import re
import operator


class NLPExtractor(object):
    """
    Keyword extractor driven by a stop word list.

    The text is cut into fragments at punctuation, every fragment is cut again
    at stop words, and the remaining phrases are ranked by the summed scores
    of their words (word score = (degree + frequency) / frequency).
    """

    # Anything that is not a letter, digit, '_', '+', '-' or '/' separates
    # words. Compiled once because separate_words runs for every phrase.
    _WORD_SPLITTER = re.compile(r'[^a-zA-Z0-9_\+\-/]')

    # Punctuation (and a dash surrounded by whitespace) that ends a fragment.
    _SENTENCE_DELIMITERS = re.compile(
        u'[.!?,;:\t\\\\"\\(\\)\\\'\u2019\u2013]|\\s\\-\\s')

    def __init__(self):
        """
        Locates ``stopwords.txt`` next to this module and compiles the stop
        word pattern from it.
        """
        self.stop_words_path = os.path.join(
            os.path.dirname(os.path.abspath(__file__)),
            'stopwords.txt')
        self.stop_words_pattern = self.build_stop_word_regex()

    def build_stop_word_regex(self):
        """
        Creates stop word regex.

        Every stop word is escaped, so entries containing regex
        metacharacters (for example ``c++``) are matched literally instead of
        breaking compilation.

        :return: compiled, case-insensitive stop word pattern. If the stop
            word list is empty the pattern never matches.
        """
        stop_word_regex_list = [
            r'\b' + re.escape(word) + r'(?![\w-])'
            for word in self.load_stop_words()
        ]
        if not stop_word_regex_list:
            # Joining nothing would give '', which matches at every position
            # and would shred the text into single characters.
            return re.compile(r'(?!x)x')
        return re.compile('|'.join(stop_word_regex_list), re.IGNORECASE)

    def load_stop_words(self):
        """
        Utility function to load stop words from a file and return as a list of
        words.

        Lines whose first non-blank character is ``#`` are skipped; every
        other line may hold several whitespace-separated words.

        :return: list A list of stop words.
        """
        stop_words = []
        # The context manager closes the handle even if reading fails.
        with open(self.stop_words_path) as stop_words_file:
            for line in stop_words_file:
                if line.strip().startswith("#"):
                    continue
                stop_words.extend(line.split())
        return stop_words

    @staticmethod
    def is_number(word):
        """
        Checks whether word is a number.

        Strings containing a dot are parsed with ``float``, all others with
        ``int``.

        :param str word: Word to be checked.
        :return: True or False
        """
        try:
            float(word) if '.' in word else int(word)
        except ValueError:
            return False
        return True

    @staticmethod
    def separate_words(text, min_word_return_size):
        """
        Utility function to return a list of all words that have a length
        greater than a specified number of characters.

        Words are lower-cased. Numbers are never returned.

        :param str text: The text that must be split in to words.
        :param int min_word_return_size: The minimum no of characters a word \
        must have to be included.
        :return: list of separated words.
        """
        words = []
        for single_word in NLPExtractor._WORD_SPLITTER.split(text):
            current_word = single_word.strip().lower()
            # leave numbers in phrase, but don't count as words,
            # since they tend to invalidate scores of their phrases
            if (len(current_word) > min_word_return_size
                    and current_word != ''
                    and not NLPExtractor.is_number(current_word)):
                words.append(current_word)
        return words

    @staticmethod
    def split_sentences(text):
        """
        Utility function to return a list of sentences.

        :param str text: The text that must be split in to sentences.
        :return: sentences List of sentences created due to split; fragments
            may be empty or carry surrounding whitespace.
        """
        return NLPExtractor._SENTENCE_DELIMITERS.split(text)

    @staticmethod
    def generate_candidate_keywords(sentence_list, stopword_pattern):
        """
        Generates list of keywords candidates.

        Each stop word match is replaced by ``|`` and the sentence is then
        split on ``|``, so a literal ``|`` in the text also separates phrases.

        :param list sentence_list: List of sentences to be processed.
        :param stopword_pattern: Stop words pattern (compiled pattern or
            pattern string).
        :return: list of keywords, lower-cased and stripped, without empty
            entries.
        """
        phrase_list = []
        for sentence in sentence_list:
            marked = re.sub(stopword_pattern, '|', sentence.strip())
            for phrase in marked.split('|'):
                phrase = phrase.strip().lower()
                if phrase:
                    phrase_list.append(phrase)
        return phrase_list

    @staticmethod
    def calculate_word_scores(phrase_list):
        """
        Calculates words scores based on their frequency and degree.

        The degree of a word is the number of other words it shares phrases
        with, plus its own frequency. The score is degree / frequency.

        :param list phrase_list: List of phrases to be processed.
        :return: mapping between word and its score (float).
        """
        word_frequency = {}
        word_degree = {}
        for phrase in phrase_list:
            word_list = NLPExtractor.separate_words(phrase, 0)
            co_occurrences = len(word_list) - 1
            for word in word_list:
                word_frequency[word] = word_frequency.get(word, 0) + 1
                word_degree[word] = word_degree.get(word, 0) + co_occurrences
        # float() keeps true division on Python 2 as well.
        return {
            word: (word_degree[word] + frequency) / float(frequency)
            for word, frequency in word_frequency.items()
        }

    @staticmethod
    def generate_candidate_keyword_scores(phrase_list, word_score):
        """
        Generates scores for candidate keywords.

        A phrase scores the sum of the scores of its words. A phrase that
        occurs several times appears once in the result.

        :param list phrase_list: List of phrases to be processed.
        :param map word_score: Mapping between word and its score.
        :return: mapping between phrases and their scores.
        """
        keyword_candidates = {}
        for phrase in phrase_list:
            word_list = NLPExtractor.separate_words(phrase, 0)
            keyword_candidates[phrase] = sum(
                word_score[word] for word in word_list)
        return keyword_candidates

    def run(self, text):
        """
        Extracts keywords from the text.

        :param str text: Text to be processed.
        :return: list of keywords, highest score first.
        """
        sentence_list = NLPExtractor.split_sentences(text)
        phrase_list = NLPExtractor.generate_candidate_keywords(
            sentence_list, self.stop_words_pattern)
        word_scores = NLPExtractor.calculate_word_scores(phrase_list)
        keyword_candidates = NLPExtractor.generate_candidate_keyword_scores(
            phrase_list,
            word_scores)
        # items() exists on Python 2 and 3; iteritems() is Python 2 only.
        sorted_keywords = sorted(keyword_candidates.items(),
                                 key=operator.itemgetter(1), reverse=True)
        return [keyword for keyword, _ in sorted_keywords]