GosuSEO
Junior Member
- Nov 11, 2022
The code below is a script that filters a list of keywords from a text file, keeping only those related to a specific topic, in this case "Search Engine Optimization (SEO)". A breakdown of what it does follows the code.

import openai
import os
from translate import Translator
import concurrent.futures
from tqdm import tqdm
import backoff
import requests.exceptions
# Set your API key
api_key = "Your OpenAI API Key"
def read_keywords(file_path):
    # Yield keywords from the file one stripped line at a time
    with open(file_path, "r", encoding="utf-8") as file:
        for line in file:
            yield line.strip()
def is_english(text):
    # Rough check: treat ASCII-only text as English
    try:
        text.encode('ascii')
        return True
    except UnicodeEncodeError:
        return False
def translate_to_english(text):
    # Translate the keyword to English using the `translate` package
    translator = Translator(to_lang="en")
    translated = translator.translate(text)
    return translated
# Retry with exponential backoff on network errors and OpenAI API errors (e.g. rate limits)
@backoff.on_exception(backoff.expo, (requests.exceptions.RequestException, openai.error.OpenAIError), max_tries=10)
def is_keyword_related(topic, keyword):
    openai.api_key = api_key
    prompt = f"Given the topic '{topic}', is the keyword '{keyword}' related to the topic? (yes or no, only say Yes or no and you must not say anything else.)\n"
    response = openai.Completion.create(
        engine="text-davinci-003",
        prompt=prompt,
        max_tokens=10,
        n=1,
        stop=None,
        temperature=0.7,
    )
    output = response.choices[0].text.strip().lower()
    # Treat answers like "Yes." or "yes, it is" as a match
    return output.startswith("yes")
def process_keyword(keyword):
    # Translate non-English keywords before checking relatedness
    if not is_english(keyword):
        keyword = translate_to_english(keyword)
    is_related = is_keyword_related(topic, keyword)
    print(f"Keyword: {keyword}, Is related? {'Yes' if is_related else 'No'}")
    if is_related:
        return keyword
    return None
# Define your topic
topic = "Search Engine Optimization(SEO)"
# Load your keyword list line by line
keyword_generator = list(read_keywords("keyword_plastic_surgery.txt"))
# Process keywords using 100 threads
filtered_keywords = []
thread_count = 20
batch_size = 100
batch_count = 1
with concurrent.futures.ThreadPoolExecutor(max_workers=thread_count) as executor:
    futures = {executor.submit(process_keyword, keyword): keyword for keyword in keyword_generator}
    for future in tqdm(concurrent.futures.as_completed(futures), total=len(futures), desc="Processing keywords"):
        result = future.result()
        if result is not None:
            filtered_keywords.append(result)
            # Save the filtered keywords to a new .txt file every 100 results
            if len(filtered_keywords) == batch_size:
                with open(f"filtered_keywords_batch_{batch_count}.txt", "w", encoding="utf-8") as file:
                    for kw in filtered_keywords:
                        file.write(kw + "\n")
                filtered_keywords = []
                batch_count += 1
# Save the remaining filtered keywords to the final batch file
if filtered_keywords:
    with open(f"filtered_keywords_batch_{batch_count}.txt", "w", encoding="utf-8") as file:
        for kw in filtered_keywords:
            file.write(kw + "\n")
# Combine all batch files into one file
with open("filtered_keywords_combined.txt", "w", encoding="utf-8") as combined_file:
    for i in range(1, batch_count + 1):
        batch_path = f"filtered_keywords_batch_{i}.txt"
        # The last index may have no file if there were no leftover keywords
        if not os.path.exists(batch_path):
            continue
        with open(batch_path, "r", encoding="utf-8") as batch_file:
            for line in batch_file:
                combined_file.write(line)
        # Delete the batch file after combining
        os.remove(batch_path)
print("All filtered keywords combined into filtered_keywords_combined.txt")
- Reads keywords from a text file line by line.
- Checks whether the keyword is ASCII-only (a rough English check); if not, translates it to English.
- Determines if the keyword is related to the topic using OpenAI's GPT-3 model (the text-davinci-003 engine). The is_keyword_related function calls the GPT-3 API to judge the relationship between the keyword and the topic, and retries failed requests with exponential backoff. (A sketch of the same check against the newer Chat Completions API follows this list.)
- Processes the keywords concurrently with a ThreadPoolExecutor (20 worker threads) to speed things up.
- Writes the filtered keywords to multiple batch files, with a specified batch size (100 in this case).
- Combines all batch files into a single "filtered_keywords_combined.txt" file and deletes the individual batch files.
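Since text-davinci-003 has been retired, here is a minimal sketch of the same yes/no check against the Chat Completions API. It assumes the openai>=1.0 Python client and the gpt-3.5-turbo model, which are not part of the original script; adjust the model name to whatever you have access to.

from openai import OpenAI

client = OpenAI(api_key="Your OpenAI API Key")

def is_keyword_related_chat(topic, keyword):
    # Same prompt idea as above, but sent as a chat message
    prompt = (f"Given the topic '{topic}', is the keyword '{keyword}' related to the topic? "
              "Answer only 'yes' or 'no'.")
    response = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[{"role": "user", "content": prompt}],
        max_tokens=5,
        temperature=0,  # deterministic yes/no answers
    )
    answer = response.choices[0].message.content.strip().lower()
    return answer.startswith("yes")

The rest of the script (threads, batching, combining files) works unchanged with this drop-in replacement.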
The script can be costly since it runs on GPT-3: filtering about 30k keywords used roughly $25 in credits.
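For a rough sanity check on that figure, here is a back-of-the-envelope estimate. The per-call token count and the ~$0.02 per 1K tokens legacy text-davinci-003 price are my assumptions, not numbers from the post.

tokens_per_call = 60          # ~50 prompt tokens + up to 10 completion tokens (rough guess)
price_per_1k_tokens = 0.02    # USD, approximate legacy text-davinci-003 pricing (assumed)
keyword_count = 30_000
estimated_cost = keyword_count * tokens_per_call / 1000 * price_per_1k_tokens
print(f"~${estimated_cost:.0f}")  # prints ~$36, the same ballpark as the reported ~$25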
It does produce some false positives, but it is very good at filtering out unrelated keywords.
Fund me if you're good! ETH|ARB: 0x4a25FADB34870E0e7ecF6f3AF5EC7066B6Ebc690