Email Harvester Source Code

yellowcat

Regular Member
Joined
Aug 27, 2015
Messages
355
Reaction score
244
I was drunk when I wrote this
Code:
# All cats are yellow
import os
import re
import threading
import time

import requests
import unicodecsv
from threading import Lock
from urlparse import urlparse
from bs4 import BeautifulSoup
from random import randint

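# How it fits together: harvest() worker threads pop urls off self.urls, fetch
# the page, and push the raw html onto self.html. A single sort() thread pops
# html off that list, writes any emails it finds to loot.csv, and feeds new
# same-domain links back into self.urls. self.mutex guards the url queue,
# self.hmutex guards the html list, and the main thread just loops in stats().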
class Crawler():
    def __init__(self,domains):
        threads = 20 # sane default in case config.txt is missing or malformed
        with open('config.txt','rb') as f:
            data = f.readlines()
        for x in data:
            if "threads " in x:
                try:
                    threads = int(x.strip().split("threads = ")[1].strip())
                except:
                    print "config.txt is not formatted correctly"
                    print "the proper format is the following:"
                    print "threads = 20"


        self.mutex = Lock()
        self.hmutex = Lock()
        self.loot = []
        self.urls = []
        self.found = 0
        self.burnt = []
        self.filey = open("loot.csv","wb")
        header = ["email"]
        self.writer = unicodecsv.writer(self.filey, encoding='utf-8')
        self.writer.writerow(header)
        self.filey.flush()

        ##Append start urlz
        for domain in domains:
            parsed_uri = urlparse(domain)
            mainDomain = '{uri.netloc}'.format(uri=parsed_uri)
            scheme = '{uri.scheme}://'.format(uri=parsed_uri)
            self.urls.append({"mainDomain": mainDomain, "scheme": scheme, "url": domain})
            self.burnt.append(domain)

        self.total = 0
        self.html = []

        self.harvestThreadz = []
        for _ in range(threads):
            t = threading.Thread(target=self.harvest)
            t.setDaemon(True) # daemon threads so Ctrl+C actually kills the crawler
            self.harvestThreadz.append(t)
            t.start()
        sortThread = threading.Thread(target=self.sort)
        sortThread.setDaemon(True)
        sortThread.start()


        self.stats()
    def stats(self):

        try:
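            # windows-only console tweaks (yellow text, fixed window size);
            # on other platforms these just do nothing useful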
            os.system("color e")
            os.system("mode con: cols=100 lines=17")
        except:
            pass
        s1 = "\n"*17 + """
             \    /\\
              )  ( ')    Urls to Scrape => %s
             (  /  )     Urls Scraped => %s
              \(__)|     Emails found =>   %s
               """

        s2 = "\n"*17 + """
             \    /\\
              )  ( ') Created By YellowCat
             (  /  )  Skype @yellowcat1771
              \(__)|  All Cats Are Yellow
               """

        while True:
            print s1 % (str(len(self.urls)),str(self.total),str(self.found))
            time.sleep(1)
            os.system("cls")
            print s2
            time.sleep(1)
            os.system("cls")

    def harvest(self):
        while True:
            while True: #Wait for urls!!!
                self.mutex.acquire()
                if self.urls:
                    urlData = self.urls.pop(randint(0,len(self.urls)-1))
                    self.mutex.release()
                    break
                self.mutex.release()
                time.sleep(0.1) # don't spin at 100% cpu while the queue is empty
            try:
                r = requests.get(urlData["url"],timeout=15)

                self.hmutex.acquire()
                self.total+=1
                self.html.append({"urlData":urlData,"html":r.text})
                self.hmutex.release()
            except Exception:
                pass # timeouts and dead urls just get skipped

    def sort(self):
        emailsBurnt = []

        while True:
            while True:#Wait for new htmls!!!
                self.hmutex.acquire()
                if self.html:
                    data = self.html.pop()
                    html = data["html"]
                    mainDomain = data["urlData"]["mainDomain"]
                    scheme = data['urlData']['scheme']

                    self.hmutex.release()
                    break
                self.hmutex.release()
                time.sleep(0.1) # same idle throttle as harvest()
            soupy = BeautifulSoup(html,"lxml")
            try:
                text = str(soupy)
                #regex = re.compile(("([a-z0-9!#$%&'*+\/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+\/=?^_`"
                #                    "{|}~-]+)*(@|\sat\s)(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?(\.|"
                #                    "\sdot\s))+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?)"))
                regex = re.compile(r"[\w\.-]+@[\w\.-]+\.\w+") # simple user@host.tld pattern

                emails = re.findall(regex, text)
                if emails:
                    for email in emails:
                        email = email.strip("'`") # strip stray quote characters
                        # skip dupes and obvious false positives (filenames, paths)
                        if email not in emailsBurnt and "@" in email:
                            if ".html" not in email and ".jpg" not in email and "/" not in email:
                                self.mutex.acquire()
                                try:
                                    emailsBurnt.append(email)
                                    self.writer.writerow([email])
                                    self.filey.flush()
                                    self.found+=1
                                except:
                                    pass
                                self.mutex.release()

            except:
                pass
            # collect <a> and <link> hrefs to feed the crawl queue
            anchors = soupy.find_all("a")
            anchors.extend(soupy.find_all("link"))
            for ank in anchors:
                try:
                    href = ank['href']
                    if href not in self.burnt:
                        self.mutex.acquire()
                        self.burnt.append(href)
                        self.mutex.release()
                        if mainDomain in href:
                            # absolute link on the same domain, rebuild and queue it
                            try:
                                url = href.split(mainDomain)
                                if len(url) > 1 and len(url[1].strip()) > 1:
                                    url[1] = url[1].strip()
                                    while url[1][0] == '/':
                                        url[1] = url[1][1:]
                                    url = scheme + mainDomain + "/" + url[1]
                                    self.mutex.acquire()
                                    self.urls.append({"mainDomain": mainDomain, "scheme": scheme, "url": url})
                                    self.mutex.release()
                            except:
                                pass
                        elif href.startswith("/"):
                            # relative link, rebuild it against the current domain
                            try:
                                while href.startswith("/"):
                                    href = href[1:]
                                url = scheme + mainDomain + "/" + href
                                self.mutex.acquire()
                                self.urls.append({"mainDomain": mainDomain, "scheme": scheme, "url": url})
                                self.mutex.release()
                            except:
                                pass
                        # anything else is an external domain, skip it
                except:
                    pass





with open("domains.txt","rb") as f:
    domains = [x.strip() for x in f.readlines() if x.strip()]
for domain in domains:
    parsed_uri = urlparse(domain)
    if "http" not in '{uri.scheme}://'.format(uri=parsed_uri):
        print "Domain error =>", domain
        print "Please add https:// or http:// to the start of the url!"
        raw_input(">> Press enter to close")
        exit()
print "Loaded", len(domains), "total domains!"


if domains:
    Crawler(domains)
else:
    print "NO DOMAINS FOUND PLEASE ENTER DOMAINS IN domains.txt"
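
Heads up: this is Python 2 code (print statements, the urlparse module, unicodecsv). If you're on Python 3 the version-specific bits map over roughly like this (untested sketch, you'd also need to convert the print statements and the old except syntax):
Code:
# rough Python 3 equivalents for the py2-only parts (sketch, not a drop-in patch)
from urllib.parse import urlparse  # replaces "from urlparse import urlparse"
import csv                         # stdlib csv handles unicode natively on py3

# text mode with newline="" and an explicit encoding replaces "wb" + unicodecsv
filey = open("loot.csv", "w", newline="", encoding="utf-8")
writer = csv.writer(filey)
writer.writerow(["email"])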

Code:
[config.txt]
threads = 20
[domains.txt]
 

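In case it's not obvious: domains.txt is one url per line, scheme included. Something like this (example.com / example.org are just placeholders, use your own targets), then run the script with Python 2 from the same folder:
Code:
[domains.txt]
http://example.com
https://example.org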