GosuSEO
Junior Member
- Nov 11, 2022
- 154
- 73
This script imports the necessary libraries and defines functions to sanitize text in a given Excel file. The script's main function is sanitize_xlsx, which takes an input file path and an output file path as arguments. It reads the input Excel file, sanitizes the text in each cell, and saves the sanitized data in a new Excel file at the specified output file path.
import re
import nltk
from nltk.corpus import wordnet
from spellchecker import SpellChecker
import openpyxl
from openpyxl import Workbook
# Make sure the required NLTK resources are downloaded.
# The first three names are the legacy packages; NLTK 3.9+ looks up the
# tokenizer and tagger under "punkt_tab" and "averaged_perceptron_tagger_eng",
# so both generations are requested. nltk.download() reports (rather than
# raises on) a package id it cannot find, so the extra names are harmless on
# installs that do not know them.
for _resource in (
    'wordnet',
    'averaged_perceptron_tagger',
    'punkt',
    'averaged_perceptron_tagger_eng',
    'punkt_tab',
):
    nltk.download(_resource)
def sanitize_text(text):
    """Strip unsupported characters from *text* and spell-correct its words.

    The text is reduced to ASCII letters, digits, whitespace and the
    punctuation ``. , ? ! ' "``, tokenized and POS-tagged with NLTK, and each
    purely alphabetic token that WordNet does not know (for its tagged part of
    speech) is replaced by a suggested correction.

    Args:
        text: The string to sanitize.

    Returns:
        The sanitized, corrected text as a single string.
    """
    # Local import so this function stays self-contained; the detokenizer is
    # part of NLTK, which the module already depends on.
    from nltk.tokenize.treebank import TreebankWordDetokenizer

    # Remove non-English characters
    sanitized_text = re.sub(r"[^a-zA-Z0-9\s.,?!'\"]", '', text)

    # Tokenize words and parts of speech
    words = nltk.word_tokenize(sanitized_text)
    tagged = nltk.pos_tag(words)

    # Check for incorrect words and try to suggest corrections
    corrected_words = []
    for word, pos in tagged:
        # Punctuation, numbers and contraction fragments such as "n't" are not
        # dictionary words; sending them through spell correction would
        # replace them with unrelated words, so they are kept verbatim.
        if not word.isalpha():
            corrected_words.append(word)
            continue

        if wordnet.synsets(word, pos=nltk_to_wordnet(pos)):
            corrected_words.append(word)
            continue

        # Prefer the spell checker's suggestion; fall back to the (much
        # slower) WordNet edit-distance search only if it offers nothing.
        corrected_word = find_closest_word(word) or find_closest_word_wordnet(word)
        corrected_words.append(corrected_word)

    # Reconstruct the sentence. The detokenizer re-attaches punctuation and
    # contractions instead of leaving "Hello , world !" style spacing.
    corrected_text = TreebankWordDetokenizer().detokenize(corrected_words)
    return corrected_text
def nltk_to_wordnet(pos):
    """Translate a POS tag from ``nltk.pos_tag`` into a WordNet POS constant.

    Args:
        pos: The tag string; only its leading letter is inspected.

    Returns:
        ``wordnet.ADJ``, ``wordnet.VERB``, ``wordnet.NOUN`` or ``wordnet.ADV``
        for tags starting with ``J``, ``V``, ``N`` or ``R`` respectively;
        ``None`` for any other tag.
    """
    prefix_to_attr = (
        ('J', 'ADJ'),
        ('V', 'VERB'),
        ('N', 'NOUN'),
        ('R', 'ADV'),
    )
    for prefix, attr_name in prefix_to_attr:
        if pos.startswith(prefix):
            # Resolved lazily so tags with no match never touch `wordnet`.
            return getattr(wordnet, attr_name)
    return None
# Shared SpellChecker instance, created on first use. Building one loads the
# word-frequency dictionary, so it must not be rebuilt for every word.
_SPELL_CHECKER = None


def find_closest_word(input_word):
    """Return the spell checker's single most likely correction for a word.

    Args:
        input_word: The word to correct.

    Returns:
        Whatever ``SpellChecker.correction`` returns for *input_word*. Callers
        should treat a falsy result as "no suggestion" (presumably ``None`` in
        recent pyspellchecker releases when no candidate exists -- verify
        against the installed version).
    """
    global _SPELL_CHECKER
    if _SPELL_CHECKER is None:
        _SPELL_CHECKER = SpellChecker()
    # Get the one most likely correct word
    return _SPELL_CHECKER.correction(input_word)
def find_closest_word_wordnet(input_word):
    """Return the WordNet lemma name with the smallest edit distance to a word.

    Every lemma of every synset is compared against *input_word* using
    ``nltk.edit_distance``; the first lemma name that achieves the minimum
    distance wins. This scans the whole corpus, so it is slow and is meant as
    a last-resort fallback.

    Args:
        input_word: The word to find a replacement for.

    Returns:
        The closest lemma name as stored in WordNet, or *input_word* unchanged
        if it is empty or the corpus yields no lemmas.
    """
    # An empty string is equally "close" to every short lemma; there is
    # nothing meaningful to correct, so hand it back untouched.
    if not input_word:
        return input_word

    closest_word = None
    min_distance = float("inf")
    # Iterate the synset generator directly instead of materialising the
    # entire corpus into a list on every call.
    for synset in wordnet.all_synsets():
        for lemma in synset.lemmas():
            candidate = lemma.name()
            distance = nltk.edit_distance(input_word, candidate)
            if distance < min_distance:
                min_distance = distance
                # Keep the lemma that actually matched, not the first lemma of
                # its synset (which may be a different word altogether).
                closest_word = candidate
                if distance == 0:
                    # Exact match: nothing can beat it, stop scanning.
                    return closest_word

    if closest_word is None:
        return input_word
    return closest_word
def sanitize_xlsx(input_file, output_file):
    """Copy an Excel workbook, sanitizing every non-empty text cell.

    Each worksheet of *input_file* is recreated under the same title in a new
    workbook. String cells that contain non-whitespace text are passed through
    ``sanitize_text``; all other cell values are copied unchanged. Progress is
    printed to stdout.

    Args:
        input_file: Path of the workbook to read.
        output_file: Path the sanitized workbook is saved to.
    """
    # Load the workbook
    wb = openpyxl.load_workbook(input_file)

    # Create a new workbook for sanitized data
    sanitized_wb = Workbook()
    # Remove the default sheet *before* copying: while it exists, an input
    # sheet that is also titled "Sheet" would be renamed (e.g. "Sheet1") by
    # create_sheet() to avoid the title clash.
    sanitized_wb.remove(sanitized_wb.active)

    print("Sanitizing sheets...")
    # Iterate through sheets
    for sheet in wb:
        print(f"Sanitizing sheet: {sheet.title}")
        sanitized_sheet = sanitized_wb.create_sheet(sheet.title)

        # Iterate through rows and columns using max_row and max_column
        for row_idx in range(1, sheet.max_row + 1):
            for col_idx in range(1, sheet.max_column + 1):
                value = sheet.cell(row=row_idx, column=col_idx).value

                # Sanitize cell value if it's a non-empty string
                if isinstance(value, str) and value.strip():
                    print(f"Sanitizing cell ({row_idx}, {col_idx})")
                    value = sanitize_text(value)

                sanitized_sheet.cell(row=row_idx, column=col_idx, value=value)

    print("Saving sanitized workbook...")
    # Save the sanitized workbook
    sanitized_wb.save(output_file)
    print("Sanitization complete.")
# Example usage
input_file = "input.xlsx"  # Replace with your input file path
output_file = "sanitized_output.xlsx"  # Replace with your desired output file path

# Only run the conversion when executed as a script, so importing this module
# to reuse its functions does not read and write workbooks as a side effect.
if __name__ == "__main__":
    sanitize_xlsx(input_file, output_file)
The text sanitization process consists of:
- Removing non-English characters from the text (everything except ASCII letters, digits, whitespace, and the punctuation . , ? ! ' ").
- Tokenizing the text into words and tagging each word's part of speech using the NLTK library.
- Checking for incorrect words and suggesting corrections using the SpellChecker library and WordNet from the NLTK library.
- Reconstructing the sentence with corrected words.
To use this script, update the input_file and output_file variables with your desired input and output file paths and run the script. The script will sanitize the text in the input file and save the sanitized data to the output file.