I'm new to the IM world so there isn't too much I can contribute, but I figured I'd post my script for grabbing public proxies. It's been tested with Python 2.7 on both Windows and Linux, and it requires Selenium and Firefox. I have a bad habit of coding in a vacuum, so feedback would be greatly appreciated.

The script takes a CSV file with the sites you want to scrape, uses Selenium to dump all the text from the <body> tag, pulls ip:port pairs with some regex, and saves the pairs to a text file. The script is just something I use, so it's not production ready. It works for pages that have the ip and port next to each other in a table and for sites that list colon-separated pairs. I'm releasing this script to BHW under the WTFPL, so you can do whatever you want with it.

I'd love to better explain how to configure the program, but the forum rules about n00bs make it like pulling teeth from a honey badger. The fracking filter triggers on my --help style explanation of how to configure the script with the CSV file (it's the first line of the pastebin text). I've put an example config in a pastebin post that only contains fake URLs (to keep with the n00b rules), and if you have trouble just shoot me a PM with an email or something I can contact you through (since n00bs can't PM). The pastebin post is 6P4ecuCj.

To save space, all the modules are in this one code tag. I tried it with one module per code tag, but that just takes up way too much space.

Code:
proxy_scraper_2.py

'''
A threaded proxy scraper.

This script will scrape a list of websites for ip:port pairs.
'''
import argparse

import file_handler
from strip import strip_proxies
from scrape import run_scraper

from multiprocessing import Process, Queue, JoinableQueue, cpu_count, Event
from selenium import webdriver
from selenium.common.exceptions import WebDriverException, NoSuchElementException


def main():
    # Initialize the argparser
    parser = argparse.ArgumentParser()
    parser.add_argument('sitelist', help='A CSV file containing a list of sites to scrape.')
    parser.add_argument('-j', '--jobs', help='The number of jobs to spawn while scraping.')
    parser.add_argument('-o', '--output', help='The name of the file that the proxy list is saved to.')
    arguments = parser.parse_args()

    # Initialize the queues
    scrapeQueue = JoinableQueue()
    dumpQueue = Queue()

    # Set options
    if arguments.jobs:
        try:
            numThreads = int(arguments.jobs)
        except ValueError:
            print "--jobs must be a whole number"
            return
    else:
        try:
            numThreads = cpu_count()
        except NotImplementedError:
            numThreads = 1

    if arguments.output:
        out = arguments.output
    else:
        out = 'proxy.txt'

    # Create an empty proxy list
    proxylist = []

    # Start the scrapers. Each worker builds a ScrapeJob (and its own
    # Firefox instance) inside the child process.
    for i in range(numThreads):
        worker = Process(target=ScrapeJob, args=(scrapeQueue, dumpQueue))
        worker.start()

    # Load the scrapeQueue
    with open(arguments.sitelist, 'rb') as FILE:
        file_handler.load_queue(scrapeQueue, FILE)

    # Wait for the scrapers to finish
    scrapeQueue.join()

    # Regex the ip:port pairs out of the text dumps
    while not dumpQueue.empty():
        textDump = dumpQueue.get()
        strip_proxies(textDump, proxylist)

    # Write it all to a file
    with open(out, 'wb') as FILE:
        file_handler.save_list_to_file(proxylist, FILE)

    return


class ScrapeJob(Process):
    def __init__(self, scrape_queue, dump_queue):
        Process.__init__(self)
        # main() passes this class as the Process target, so __init__ already
        # runs in the child process and we can kick off run() right here.
        self.exit = Event()
        self.driver = webdriver.Firefox()
        self.scrape_queue = scrape_queue
        self.dump_queue = dump_queue
        self.run()

    def run(self):
        while not self.scrape_queue.empty():
            job_args = self.scrape_queue.get()
            name = job_args.pop(0)
            try:
                print 'Scraping ' + name
                run_scraper(self.driver, self.dump_queue, *job_args)
                print '[+] ' + name + ' has been scraped.'
            # NoSuchElementException is a subclass of WebDriverException,
            # so it has to be caught first.
            except NoSuchElementException as e:
                print '[-] ' + name + ' Scraper failed with NoSuchElementException:\n' + str(e)
            except WebDriverException as e:
                print '[-] ' + name + ' WebDriver failed with exception:\n' + str(e)
            except:
                print '[-] ' + name + ': Unknown exception.'
            self.scrape_queue.task_done()
        self.shutdown()

    def shutdown(self):
        self.driver.close()
        self.exit.set()


if __name__ == '__main__':
    main()
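# ----------------------------------------------------------------------
# Example run (just a sketch; sites.csv and proxies.txt are placeholder
# names, use whatever files you actually have):
#
#   python proxy_scraper_2.py sites.csv -j 4 -o proxies.txt
#
# Leaving out -j spawns one worker per CPU core; leaving out -o writes
# the results to proxy.txt in the current directory.
# ----------------------------------------------------------------------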
==============================

scrape.py

'''
This module takes a selenium webdriver and scrapes the specified site.
The full text of the site is dumped to the provided queue for later
processing.
'''


def run_scraper(driver, queue, url, urlExtension='', startPage=None, endPage=None, fill=None):
    if startPage is None or endPage is None:
        scrape_single_page(driver, queue, url)
    else:
        scrape_multiple_pages(driver, queue, url, urlExtension, startPage, endPage, fill)


def scrape_single_page(driver, queue, url):
    driver.get(url)
    dump_text(driver, queue)


def scrape_multiple_pages(driver, queue, url, urlExtension, startPage=None, endPage=None, fill=None):
    # Build each page URL as url + page number (zero-filled if fill is set) + urlExtension
    for i in range(int(startPage), int(endPage) + 1):
        if fill is None:
            driver.get(url + str(i) + urlExtension)
        else:
            driver.get(url + str(i).zfill(int(fill)) + urlExtension)
        dump_text(driver, queue)


def dump_text(driver, queue):
    dump = driver.find_element_by_tag_name('body')
    queue.put(dump.text)

==============================

strip.py

'''
This module pulls ip:port pairs from a string of text and adds them to
the provided list.
'''
import re


def strip_proxies(text, proxylist):
    # Set up the regular expressions to find the pairs
    ip_colon_port = re.compile(r'\d+\.\d+\.\d+\.\d+:\d+')
    ip_space_port = re.compile(r'\d+\.\d+\.\d+\.\d+\s\d+')

    # Collect everything that matches either regex
    matches = ip_colon_port.findall(text) + ip_space_port.findall(text)

    # Convert to ASCII, replace spaces with colons, then extend the caller's
    # list in place (rebinding the name here would throw the results away)
    matches = [convert_to_ascii(proxy) for proxy in matches]
    matches = [replace_with_colon(proxy) for proxy in matches]
    proxylist.extend(matches)


def convert_to_ascii(text):
    return text.encode('ascii', 'ignore')


def replace_with_colon(text):
    return text.replace(' ', ':')

==============================

file_handler.py

import csv


def load_queue(queue, FILE):
    reader = csv.reader(FILE)
    for row in reader:
        # Skip blank lines and comment lines that start with '#'
        if row and not ','.join(row).startswith('#'):
            queue.put(row)


def load_list(the_list, FILE):
    reader = csv.reader(FILE)
    for row in reader:
        if row and not ','.join(row).startswith('#'):
            the_list.extend(row)


def save_list_to_file(the_list, FILE):
    for string in the_list:
        FILE.write(string + "\n")

I apologize in advance if the filter makes it impossible to contribute answers to questions about this script on the forum. I already burned over half an hour trying to post a docstring with no URLs or email addresses. Hopefully I can get my post count high enough to be allowed to contribute beyond just shooting the breeze in the lounge.
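In case the pastebin ever dies, here's the rough shape of a sitelist row, pieced together from how load_queue and run_scraper consume it: the first column is a display name, and the rest goes straight into run_scraper as url, urlExtension, startPage, endPage, and fill. The URLs and values below are made up (fake, to keep with the n00b rules), so treat this as a sketch rather than the real config from the pastebin.

Code:
# sitelist.csv
# name,url,urlExtension,startPage,endPage,fill
# A single-page site only needs a name and a url:
SingleSite,http://proxy-list-one.example/list.html
# A paginated site gets the page number (startPage..endPage) wedged between url and urlExtension:
PagedSite,http://proxy-list-two.example/page-,.html,1,10
# Adding fill zero-pads the page number, e.g. 001, 002, ...
ZeroFilledSite,http://proxy-list-three.example/list_,.htm,1,5,3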