# Source code for preprocessing.text

'''
Text pre-processing module:
'''


from preprocessing.errors import FunctionError, InputError
import preprocessing.spellcheck as spellcheck

import html
import json
from os import path
import re
import string

import nltk.data
nltk.data.path = [path.join(path.dirname(__file__), "data")]
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer


def _load_json_data(file_name):
    '''
    Parses the JSON file named file_name located in this module's "data" directory
    and returns the decoded object. The file handle is closed once parsing finishes.
    '''
    with open(path.join(path.dirname(__file__), "data", file_name), "r") as data_file:
        return json.load(data_file)


def _load_word_list(file_name):
    '''
    Reads the text file named file_name located in this module's "data" directory
    and returns its lines as a list of str with every newline character removed.
    The file handle is closed once reading finishes.
    '''
    with open(path.join(path.dirname(__file__), "data", file_name), "r") as data_file:
        return [line.replace("\n", "") for line in data_file]


# Tokens are either word-bounded runs of word characters, periods, slashes, commas
# and hyphens, or a single character out of - . , / ( )
KEYWORD_TOKENIZER = RegexpTokenizer(r'\b[\w.\/,-]+\b|[-.,\/()]')
LEMMATIZER = WordNetLemmatizer()
# Mapping consumed by convert_ligatures: string indices ("0", "1", ...) to
# {"ligature": ..., "term": ...} entries
LIGATURES = _load_json_data("latin_characters.json")
# One entry per line of data/word_numbers.txt
NUMBER_WORDS = _load_word_list("word_numbers.txt")
PUNCT = string.punctuation
STOPWORDS = stopwords.words("english")
SENTENCE_TOKENIZER = nltk.data.load("tokenizers/punkt/english.pickle")
# One entry per line of data/word_time.txt
TIME_WORDS = _load_word_list("word_time.txt")


#functions
def convert_html_entities(text_string):
    '''
    Converts HTML5 character references within text_string to their corresponding unicode
    characters and returns the converted string as type str.

    Any literal "&quot;" still present after unescaping (for example from double-escaped
    input such as "&amp;quot;") is replaced with an apostrophe.

    Keyword argument:

    - text_string: string instance

    Exceptions raised:

    - InputError: occurs should a non-string argument be passed
    '''
    if text_string is None or text_string == "":
        return ""
    if not isinstance(text_string, str):
        raise InputError("string not passed as argument for text_string")
    unescaped = html.unescape(text_string)
    # html.unescape performs a single pass, so a double-escaped quote survives as
    # the text "&quot;" and is normalised to an apostrophe here.
    return unescaped.replace("&quot;", "'")
def convert_ligatures(text_string):
    '''
    Replaces each ligature listed in LIGATURES that occurs within text_string with its
    associated term and returns the converted string as type str.

    Keyword argument:

    - text_string: string instance

    Exceptions raised:

    - InputError: occurs should a string or NoneType not be passed as an argument
    '''
    if text_string is None or text_string == "":
        return ""
    if not isinstance(text_string, str):
        raise InputError("none type or string not passed as an argument")
    # LIGATURES is looked up by the string form of consecutive indices ("0", "1", ...),
    # so the replacements are applied in index order.
    for index in range(len(LIGATURES)):
        entry = LIGATURES[str(index)]
        text_string = text_string.replace(entry["ligature"], entry["term"])
    return text_string
def correct_spelling(text_string):
    '''
    Splits text_string on whitespace, passes every resulting word through
    spellcheck.correct_word, and returns the corrected words joined by single spaces
    as type str.

    Keyword argument:

    - text_string: string instance

    Exceptions raised:

    - InputError: occurs should a string or NoneType not be passed as an argument
    '''
    if text_string is None or text_string == "":
        return ""
    if not isinstance(text_string, str):
        raise InputError("none type or string not passed as an argument")
    corrected_words = [spellcheck.correct_word(token) for token in text_string.split()]
    return " ".join(corrected_words)
def create_sentence_list(text_string):
    '''
    Splits text_string into sentences using SENTENCE_TOKENIZER (NLTK's english.pickle
    punkt tokenizer) and returns them as type list of str. An empty list is returned
    for None or an empty string.

    Keyword argument:

    - text_string: string instance

    Exceptions raised:

    - InputError: occurs should a non-string argument be passed
    '''
    if text_string is None or text_string == "":
        return []
    if not isinstance(text_string, str):
        raise InputError("non-string passed as argument for create_sentence_list")
    sentences = SENTENCE_TOKENIZER.tokenize(text_string)
    return sentences
def keyword_tokenize(text_string):
    '''
    Tokenizes text_string with KEYWORD_TOKENIZER, discards every token that appears in
    STOPWORDS or is shorter than 3 characters, and returns the remaining tokens joined
    by single spaces as type str.

    Keyword argument:

    - text_string: string instance

    Exceptions raised:

    - InputError: occurs should a non-string argument be passed
    '''
    if text_string is None or text_string == "":
        return ""
    if not isinstance(text_string, str):
        raise InputError("string not passed as argument for text_string")
    keywords = []
    for token in KEYWORD_TOKENIZER.tokenize(text_string):
        if len(token) < 3 or token in STOPWORDS:
            continue
        keywords.append(token)
    return " ".join(keywords)
def lemmatize(text_string):
    '''
    Returns the base form of text_string as type str, obtained by passing the whole of
    text_string to LEMMATIZER (NLTK's WordNetLemmatizer) in a single call.

    Keyword argument:

    - text_string: string instance

    Exceptions raised:

    - InputError: occurs should a non-string argument be passed
    '''
    if text_string is None or text_string == "":
        return ""
    if not isinstance(text_string, str):
        raise InputError("string not passed as primary argument")
    base_form = LEMMATIZER.lemmatize(text_string)
    return base_form
def lowercase(text_string):
    '''
    Returns a lowercase copy of text_string as type str.

    Keyword argument:

    - text_string: string instance

    Exceptions raised:

    - InputError: occurs should a non-string argument be passed
    '''
    if text_string is None or text_string == "":
        return ""
    if not isinstance(text_string, str):
        raise InputError("string not passed as argument for text_string")
    lowered = text_string.lower()
    return lowered
def preprocess_text(text_string, function_list):
    '''
    Applies each function within function_list to text_string, in the order given, feeding
    the result of one function into the next, and returns the final result.

    None or an empty text_string returns "" immediately, before function_list is inspected.

    Keyword argument:

    - function_list: list of functions available in preprocessing.text
    - text_string: string instance

    Exceptions raised:

    - FunctionError: occurs should an invalid function be passed within the list of functions.
      Any NameError or TypeError raised while calling an element is reported this way, with
      the original exception attached as the cause.
    - InputError: occurs should text_string be non-string, or function_list be non-list
    '''
    if text_string is None or text_string == "":
        return ""
    if not isinstance(text_string, str):
        raise InputError("string not passed as argument for text_string")
    if not isinstance(function_list, list):
        raise InputError("list of functions not passed as argument for function_list")
    for func in function_list:
        try:
            text_string = func(text_string)
        except (NameError, TypeError) as error:
            # Chain the original exception so the real failure stays visible in the traceback.
            raise FunctionError("invalid function passed as element of function_list") from error
    return text_string
def remove_esc_chars(text_string):
    '''
    Removes every backslash that is immediately followed by a word character, together
    with that character, from text_string, collapses the remaining whitespace into single
    spaces, and returns the new string as type str.

    Keyword argument:

    - text_string: string instance

    Exceptions raised:

    - InputError: occurs should a non-string argument be passed
    '''
    if text_string is None or text_string == "":
        return ""
    if not isinstance(text_string, str):
        raise InputError("string not passed as argument")
    without_escapes = re.sub(r'\\\w', "", text_string)
    return " ".join(without_escapes.split())
def remove_numbers(text_string):
    '''
    Removes every run of digits, periods, slashes and commas that begins at a word
    boundary within text_string, collapses the remaining whitespace into single spaces,
    and returns the new string as type str. Because periods and commas are part of the
    removed set, punctuation directly following a word is removed as well.

    Keyword argument:

    - text_string: string instance

    Exceptions raised:

    - InputError: occurs should a non-string argument be passed
    '''
    if text_string is None or text_string == "":
        return ""
    if not isinstance(text_string, str):
        raise InputError("string not passed as argument")
    without_numbers = re.sub(r'\b[\d.\/,]+', "", text_string)
    return " ".join(without_numbers.split())
def remove_number_words(text_string):
    '''
    For each entry of NUMBER_WORDS, removes every run of non-whitespace characters within
    text_string that contains the entry starting at a word boundary, then collapses the
    remaining whitespace into single spaces and returns the new string as type str.

    Keyword argument:

    - text_string: string instance

    Exceptions raised:

    - InputError: occurs should a non-string argument be passed
    '''
    if text_string is None or text_string == "":
        return ""
    if not isinstance(text_string, str):
        raise InputError("string not passed as argument")
    for number_word in NUMBER_WORDS:
        pattern = r'[\S]*\b' + number_word + r'[\S]*'
        text_string = re.sub(pattern, "", text_string)
    return " ".join(text_string.split())
def remove_time_words(text_string):
    '''
    For each entry of TIME_WORDS, removes every run of non-whitespace characters within
    text_string that contains the entry starting at a word boundary, then collapses the
    remaining whitespace into single spaces and returns the new string as type str.

    Keyword argument:

    - text_string: string instance

    Exceptions raised:

    - InputError: occurs should a non-string argument be passed
    '''
    if text_string is None or text_string == "":
        return ""
    if not isinstance(text_string, str):
        raise InputError("string not passed as argument")
    for time_word in TIME_WORDS:
        pattern = r'[\S]*\b' + time_word + r'[\S]*'
        text_string = re.sub(pattern, "", text_string)
    return " ".join(text_string.split())
def remove_unbound_punct(text_string):
    '''
    Removes from text_string every run of two or more punctuation characters, as well as
    every run of punctuation characters that starts at a position which is not a word
    boundary, then collapses the remaining whitespace into single spaces and returns the
    new string as type str.

    Keyword argument:

    - text_string: string instance

    Exceptions raised:

    - InputError: occurs should a non-string argument be passed
    '''
    if text_string is None or text_string == "":
        return ""
    if not isinstance(text_string, str):
        raise InputError("string not passed as argument")
    # Character class matching any single character of PUNCT
    punct_class = r'[' + PUNCT + r']'
    pattern = punct_class + punct_class + r'+|\B' + punct_class + r'+'
    cleaned = re.sub(pattern, "", text_string)
    return " ".join(cleaned.split())
def remove_urls(text_string):
    '''
    Removes every occurrence of "http" together with the non-whitespace characters that
    follow it from text_string, collapses the remaining whitespace into single spaces,
    and returns the new string as type str.

    Keyword argument:

    - text_string: string instance

    Exceptions raised:

    - InputError: occurs should a non-string argument be passed
    '''
    if text_string is None or text_string == "":
        return ""
    if not isinstance(text_string, str):
        raise InputError("string not passed as argument")
    without_urls = re.sub(r'http\S+', "", text_string)
    return " ".join(without_urls.split())
def remove_whitespace(text_string):
    '''
    Strips leading and trailing whitespace from text_string, collapses every inner run of
    whitespace into a single space, and returns the new string as type str.

    Keyword argument:

    - text_string: string instance

    Exceptions raised:

    - InputError: occurs should a string or NoneType not be passed as an argument
    '''
    if text_string is None or text_string == "":
        return ""
    if not isinstance(text_string, str):
        raise InputError("none type or string not passed as an argument")
    words = text_string.split()
    return " ".join(words)