From 7f026c8a4ce7d1137f01af1fba9476130904a8ae Mon Sep 17 00:00:00 2001
From: 2O4 <35725720+2O4@users.noreply.github.com>
Date: Mon, 13 Jul 2020 16:03:51 +0200
Subject: [PATCH] formatting

---
 .gitignore                           |   4 +-
 onioningestor/config.py              | 112 ++++++++++---------
 onioningestor/dbhandler.py           |   7 +-
 onioningestor/loghandler.py          |   2 +-
 onioningestor/operators/__init__.py  |   3 +-
 onioningestor/operators/html.py      | 158 +++++++++++++++------
 onioningestor/operators/onionscan.py |  67 ++++++------
 onioningestor/operators/yara.py      |   1 +
 8 files changed, 190 insertions(+), 164 deletions(-)

diff --git a/.gitignore b/.gitignore
index 8a26afc..c252f26 100644
--- a/.gitignore
+++ b/.gitignore
@@ -3,10 +3,12 @@ webui
 templates
 OnionIngestor.egg-info
 screenshots
-dump.rdb 
+dump.rdb
 onionscandb
 config.yml
 *.log
 *.pyc
 __pycache__
 venv
+.vscode
+build
diff --git a/onioningestor/config.py b/onioningestor/config.py
index 096c6cf..f7cd6cc 100644
--- a/onioningestor/config.py
+++ b/onioningestor/config.py
@@ -6,42 +6,41 @@ import yaml
 from pathlib import Path
 
-SOURCE = 'onioningestor.sources'
-OPERATOR = 'onioningestor.operators'
+SOURCE = "onioningestor.sources"
+OPERATOR = "onioningestor.operators"
 
 INTERNAL_OPTIONS = [
-    'saved_state',
-    'module',
-    'credentials',
+    "saved_state",
+    "module",
+    "credentials",
 ]
-
-ARTIFACT_TYPES = 'artifact_types'
-FILTER_STRING = 'filter'
-ALLOWED_SOURCES = 'allowed_sources'
-NAME = 'name'
+ARTIFACT_TYPES = "artifact_types"
+FILTER_STRING = "filter"
+ALLOWED_SOURCES = "allowed_sources"
+NAME = "name"
 
 
 class Config:
     """Config read/write operations, and convenience methods."""
+
     def __init__(self, filename, logger):
         """Read a config file."""
         self.logger = logger
         self.filename = filename
-        with io.open(self.filename, 'r') as f:
+        with io.open(self.filename, "r") as f:
             try:
                 self.logger.info("Loading config file")
                 self.config = yaml.safe_load(f.read())
             except yaml.error.YAMLError:
                 self.logger.error("YAML error in config")
 
-
     @staticmethod
     def _load_plugin(plugin_type, plugin):
         """Returns plugin class or raises an exception.
         :raises: threatingestor.exceptions.PluginError
         """
         try:
-            module = importlib.import_module('.'.join([plugin_type, plugin]))
+            module = importlib.import_module(".".join([plugin_type, plugin]))
             return module.Plugin
         except Exception as e:
             print(e)
@@ -49,89 +48,86 @@ class Config:
 
     def daemon(self):
         """Returns boolean, are we daemonizing?"""
-        return self.config['general']['daemon']
-
+        return self.config["general"]["daemon"]
 
     def elasticsearch(self):
         """Returns elasticsaerch config"""
-        return self.config['general']['elasticsearch']
-
+        return self.config["general"]["elasticsearch"]
 
     def sleep(self):
         """Returns number of seconds to sleep between iterations, if daemonizing."""
-        return self.config['general']['sleep']
+        return self.config["general"]["sleep"]
 
     def blacklist(self):
-        return self.config['general']['blacklist'].split(',')
-
-# def onionscanner(self):
-#     """Returns onionscanner config dict"""
-#     screenshots = self.config['onionscanner'].pop('screenshots_path', None)
-#     if screenshots:
-#         self.config['onionscanner']['screenshots_path'] = Path(screenshots)
-#     else:
-#         self.config['onionscanner']['screenshots_path'] = Path(__file__).parents[1]/'screenshots'
-#     blacklist = self.config['onionscanner'].pop('blacklist', None)
-#     if blacklist:
-#         self.config['onionscanner']['blacklist'] = blacklist.split(',')
-#     interestingKeywords = self.config['onionscanner'].pop('interestingKeywords', None)
-#     if interestingKeywords:
-#         self.config['onionscanner']['interestingKeywords'] = blacklist.split(',')
-#     return self.config['onionscanner']
-
+        return self.config["general"]["blacklist"].split(",")
+
+    # def onionscanner(self):
+    #     """Returns onionscanner config dict"""
+    #     screenshots = self.config['onionscanner'].pop('screenshots_path', None)
+    #     if screenshots:
+    #         self.config['onionscanner']['screenshots_path'] = Path(screenshots)
+    #     else:
+    #         self.config['onionscanner']['screenshots_path'] = Path(__file__).parents[1]/'screenshots'
+    #     blacklist = self.config['onionscanner'].pop('blacklist', None)
+    #     if blacklist:
+    #         self.config['onionscanner']['blacklist'] = blacklist.split(',')
+    #     interestingKeywords = self.config['onionscanner'].pop('interestingKeywords', None)
+    #     if interestingKeywords:
+    #         self.config['onionscanner']['interestingKeywords'] = blacklist.split(',')
+    #     return self.config['onionscanner']
 
     def notifiers(self):
         """Returns notifiers config dictionary."""
-        return self.config.get('notifiers', {})
-
+        return self.config.get("notifiers", {})
 
     def logging(self):
         """Returns logging config dictionary."""
-        return self.config.get('logging', {})
-
+        return self.config.get("logging", {})
 
     def credentials(self, credential_name):
         """Return a dictionary with the specified credentials."""
-        for credential in self.config['credentials']:
+        for credential in self.config["credentials"]:
             for key, value in credential.items():
                 if key == NAME and value == credential_name:
                     return credential
 
         return {}
 
-
     def sources(self):
         """Return a list of (name, Source class, {kwargs}) tuples.
         :raises: threatingestor.exceptions.PluginError
         """
         sources = []
-        for source in self.config['sources']:
+        for source in self.config["sources"]:
             kwargs = {}
             for key, value in source.items():
                 if key not in INTERNAL_OPTIONS:
                     kwargs[key] = value
-                elif key == 'credentials':
+                elif key == "credentials":
                     # Grab these named credentials
                     credential_name = value
-                    for credential_key, credential_value in self.credentials(credential_name).items():
+                    for credential_key, credential_value in self.credentials(
+                        credential_name
+                    ).items():
                         if credential_key != NAME:
                             kwargs[credential_key] = credential_value
 
             # load and initialize the plugin
             self.logger.info(f"Found source '{source[NAME]}'")
-            sources.append((source[NAME], self._load_plugin(SOURCE, source['module']), kwargs))
+            sources.append(
+                (source[NAME], self._load_plugin(SOURCE, source["module"]), kwargs)
+            )
 
         self.logger.info(f"Found {len(sources)} total sources")
         return sources
 
-
     def operators(self):
         """Return a list of (name, Operator class, {kwargs}) tuples.
 
         :raises: threatingestor.exceptions.PluginError
         """
         operators = []
-        for operator in self.config['operators']:
+        for operator in self.config["operators"]:
             kwargs = {}
             for key, value in operator.items():
                 if key not in INTERNAL_OPTIONS:
@@ -140,7 +136,11 @@ class Config:
                         artifact_types = []
                         for artifact in value:
                             try:
-                                artifact_types.append(threatingestor.artifacts.STRING_MAP[artifact.lower().strip()])
+                                artifact_types.append(
+                                    threatingestor.artifacts.STRING_MAP[
+                                        artifact.lower().strip()
+                                    ]
+                                )
                             except KeyError:
                                 # ignore invalid artifact types
                                 pass
@@ -148,7 +148,7 @@ class Config:
 
                     elif key == FILTER_STRING:
                         # pass in special filter_string option
-                        kwargs['filter_string'] = value
+                        kwargs["filter_string"] = value
 
                     elif key == NAME:
                         # exclude name key from operator kwargs, since it's not used
@@ -157,16 +157,24 @@ class Config:
                     else:
                         kwargs[key] = value
 
-                elif key == 'credentials':
+                elif key == "credentials":
                     # Grab these named credentials
                     credential_name = value
-                    for credential_key, credential_value in self.credentials(credential_name).items():
+                    for credential_key, credential_value in self.credentials(
+                        credential_name
+                    ).items():
                         if credential_key != NAME:
                             kwargs[credential_key] = credential_value
 
             # load and initialize the plugin
             self.logger.info(f"Found operator '{operator[NAME]}'")
-            operators.append((operator[NAME], self._load_plugin(OPERATOR, operator['module']), kwargs))
+            operators.append(
+                (
+                    operator[NAME],
+                    self._load_plugin(OPERATOR, operator["module"]),
+                    kwargs,
+                )
+            )
 
         self.logger.info(f"Found {len(operators)} total operators")
         return operators
diff --git a/onioningestor/dbhandler.py b/onioningestor/dbhandler.py
index d4cb213..a8f6b7d 100644
--- a/onioningestor/dbhandler.py
+++ b/onioningestor/dbhandler.py
@@ -8,7 +8,7 @@ class DbHandlerElasticSearch:
         self.logger = logger
         self.logger.info('Creating Elasticsearch mapping')
         self.config = config
-        self.mapping = '''
+        self.mapping = """
        {
          "mappings": {
            "_doc": {
@@ -18,7 +18,7 @@ class DbHandlerElasticSearch:
                },
                "blacklist": {
                    "type": "keyword"
-                },
+                },
                "monitor": {
                    "type": "boolean",
                    "null_value": "false"
@@ -50,7 +50,7 @@ class DbHandlerElasticSearch:
            }
          }
        }
-        '''
+        """
        self.index = self.config['index']
        try:
            self.es = Elasticsearch([{
@@ -87,4 +87,3 @@ class DbHandlerElasticSearch:
        status = self.es.index(index=self.index,body=data)
        self.count()
        return status
-
diff --git a/onioningestor/loghandler.py b/onioningestor/loghandler.py
index 1189b4e..61af88e 100644
--- a/onioningestor/loghandler.py
+++ b/onioningestor/loghandler.py
@@ -2,6 +2,7 @@ import os
 import logging
 from pathlib import Path
 
+
 class LoggerHandler():
     def __init__(self, level):
         self.level = getattr(logging, level)
@@ -30,4 +31,3 @@ class LoggerHandler():
     def start_logging(self):
         self.logger.info('Starting OnionScraper')
         return self.logger
-
diff --git a/onioningestor/operators/__init__.py b/onioningestor/operators/__init__.py
index 7bb083a..c2bfd1e 100644
--- a/onioningestor/operators/__init__.py
+++ b/onioningestor/operators/__init__.py
@@ -19,6 +19,7 @@ class Operator:
     method name with an underscore to denote a ``_private_method``.
 
     Do not override other existing methods from this class.
     """
+
     def __init__(self, logger, elasticsearch, allowed_sources=None):
         """Override this constructor in child classes.
@@ -87,7 +88,7 @@ class Operator:
     def collect(self, onions):
         for onion in onions:
             self.logger.info(f'thread function processing {onion}')
-            # Add link to database 
+            # Add link to database
             db = self.es.save({
                 'hiddenService':onion.url,
                 'monitor':'false',
diff --git a/onioningestor/operators/html.py b/onioningestor/operators/html.py
index 8ccf821..d55ec64 100644
--- a/onioningestor/operators/html.py
+++ b/onioningestor/operators/html.py
@@ -24,24 +24,27 @@ class Plugin(Operator):
 
     def __init__(self, logger, elasticsearch, allowed_sources, **kwargs):
         super(Plugin, self).__init__(logger, elasticsearch, allowed_sources)
-        self.plugin_name = 'simple-html'
+        self.plugin_name = "simple-html"
         self.logger.info(f"Initializing {self.plugin_name}")
-        self.timeout = int(kwargs['timeout'])
-        self.retries = int(kwargs['retries'])
-
-        interesting = kwargs['interestingKeywords'].split(',')
-        self.interesting = re.compile('|'.join([re.escape(word) for word in interesting]), re.IGNORECASE)
-
-        self.proxy = kwargs['socks5']
-        self.torControl = kwargs['TorController']
-        self.headers ={
-            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:72.0) Gecko/20100101 Firefox/72.0',
-            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
-            'Accept-Language':'en-US,en;q=0.5',
-            'DNT': '1', 'Connection':
-            'keep-alive',
-            'Upgrade-Insecure-Requests': '1'}
+        self.timeout = int(kwargs["timeout"])
+        self.retries = int(kwargs["retries"])
+
+        interesting = kwargs["interestingKeywords"].split(",")
+        self.interesting = re.compile(
+            "|".join([re.escape(word) for word in interesting]), re.IGNORECASE
+        )
+
+        self.proxy = kwargs["socks5"]
+        self.torControl = kwargs["TorController"]
+        self.headers = {
+            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:72.0) Gecko/20100101 Firefox/72.0",
+            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
+            "Accept-Language": "en-US,en;q=0.5",
+            "DNT": "1",
+            "Connection": "keep-alive",
+            "Upgrade-Insecure-Requests": "1",
+        }
 
     def get_tor_session(self):
         try:
@@ -54,69 +57,84 @@ class Plugin(Operator):
         return s
 
     def renew_connection(self):
-        with Controller.from_port(port = self.torControl['port']) as controller:
+        with Controller.from_port(port=self.torControl["port"]) as controller:
             # Now we switch TOR identities to make sure we have a good connection
-            self.logger.info('Getting new Tor IP')
+            self.logger.info("Getting new Tor IP")
             # authenticate to our local TOR controller
-            controller.authenticate(self.torControl['password'])
+            controller.authenticate(self.torControl["password"])
             # send the signal for a new identity
             controller.signal(Signal.NEWNYM)
             # wait for the new identity to be initialized
             time.sleep(controller.get_newnym_wait())
 
             session = self.get_tor_session()
-            self.logger.info(f"IP is {session.get('http://httpbin.org/ip').json()['origin']}")
+            self.logger.info(
+                f"IP is {session.get('http://httpbin.org/ip').json()['origin']}"
+            )
 
     def run_sessions(self, onion):
-        retry = 0
-        result = None
-        while True:
-            try:
-                url = 'http://'+onion
-                self.logger.info(url)
-                content = self.get_tor_session().get(url)
-                if content.status_code == 200:
-                    result = content.text
-                    if result:
-                        html = BeautifulSoup(result,features="lxml")
-                        # testing hardcorded filepath
-                        with open("/home/tony/Projects/OnionScraper_v2/onion_master_list.txt", "w") as fp:
-                            for onion in re.findall('([a-z2-7]{16,56}\.onion)',result):
-                                fp.write("%s\n" % onion)
-                        if html:
-                            index = {
-                                'HTML':result,
-                                'title':html.title.text,
-                                'language':detect(html.text),
-                                'date-crawled':dt.utcnow().strftime('%Y-%m-%dT%H:%M:%S.%f')+ 'Z',
-                                'status':'success',
-                                'interestingKeywords':list(set(self.interesting.findall(result)))
-                            }
-                        else:
-                            index = {
-                                'HTML':result,
-                                'title': None,
-                                'language': None,
-                                'date-crawled':dt.utcnow().strftime('%Y-%m-%dT%H:%M:%S.%f')+ 'Z',
-                                'status':'success',
-                                'interestingKeywords':list(set(self.interesting.findall(result)))
-                            }
-                        return self.response(index, onion, self.plugin_name)
-            except requests.exceptions.ConnectionError as connection_error:
-                self.logger.error(f'Failed connecting to http://{url}')
-                self.logger.debug(connection_error)
-            except Exception as e:
-                self.logger.error(e)
-                self.logger.debug(traceback.print_exc())
-
-            self.logger.info('[x] No results found retrying ...')
-            retry += 1
-            self.renew_connection()
-            if retry > self.retries:
-                self.logger.error('[x] Max retries exceeded')
-                return self.response({'status':"failure"}, onion, self.plugin_name)
+        retry = 0
+        result = None
+        while True:
+            try:
+                url = "http://" + onion
+                self.logger.info(url)
+                content = self.get_tor_session().get(url)
+                if content.status_code == 200:
+                    result = content.text
+                    if result:
+                        html = BeautifulSoup(result, features="lxml")
+                        # testing hardcorded filepath
+                        with open(
+                            "/home/tony/Projects/OnionScraper_v2/onion_master_list.txt",
+                            "w",
+                        ) as fp:
+                            for onion in re.findall("([a-z2-7]{16,56}\.onion)", result):
+                                fp.write("%s\n" % onion)
+                        if html:
+                            index = {
+                                "HTML": result,
+                                "title": html.title.text,
+                                "language": detect(html.text),
+                                "date-crawled": dt.utcnow().strftime(
+                                    "%Y-%m-%dT%H:%M:%S.%f"
+                                )
+                                + "Z",
+                                "status": "success",
+                                "interestingKeywords": list(
+                                    set(self.interesting.findall(result))
+                                ),
+                            }
+                        else:
+                            index = {
+                                "HTML": result,
+                                "title": None,
+                                "language": None,
+                                "date-crawled": dt.utcnow().strftime(
+                                    "%Y-%m-%dT%H:%M:%S.%f"
+                                )
+                                + "Z",
+                                "status": "success",
+                                "interestingKeywords": list(
+                                    set(self.interesting.findall(result))
+                                ),
+                            }
+                        return self.response(index, onion, self.plugin_name)
+            except requests.exceptions.ConnectionError as connection_error:
+                self.logger.error(f"Failed connecting to http://{url}")
+                self.logger.debug(connection_error)
+            except Exception as e:
+                self.logger.error(e)
+                self.logger.debug(traceback.print_exc())
+
+            self.logger.info("[x] No results found retrying ...")
+            retry += 1
+            self.renew_connection()
+            if retry > self.retries:
+                self.logger.error("[x] Max retries exceeded")
+                return self.response({"status": "failure"}, onion, self.plugin_name)
 
     def handle_onion(self, db, onion):
         content = self.run_sessions(onion)
-        if content[self.plugin_name]['status'] == 'success':
-            if self._onion_is_allowed(content, db, 'HTML'):
-                self.es.update(db['_id'], content)
+        if content[self.plugin_name]["status"] == "success":
+            if self._onion_is_allowed(content, db, "HTML"):
+                self.es.update(db["_id"], content)
diff --git a/onioningestor/operators/onionscan.py b/onioningestor/operators/onionscan.py
index 833e72b..3367d0c 100644
--- a/onioningestor/operators/onionscan.py
+++ b/onioningestor/operators/onionscan.py
@@ -24,6 +24,7 @@ from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
 
 from onionscraper.operators import Operator
 
+
 class Plugin(Operator):
     """OnionScraper main work logic.
 
@@ -49,8 +50,8 @@ class Plugin(Operator):
             'Accept-Language':'en-US,en;q=0.5',
             'DNT': '1', 'Connection':
             'keep-alive',
-            'Upgrade-Insecure-Requests': '1'}
-
+            'Upgrade-Insecure-Requests': '1'
+            }
 
         blacklist = kwargs['blacklist'].split(',')
         self.blacklist = re.compile('|'.join([re.escape(word) for word in blacklist]), re.IGNORECASE)
@@ -110,9 +111,10 @@ class Plugin(Operator):
         options = Options()
         options.headless = True
         driver = webdriver.Firefox(
-                executable_path='/home/tony/Projects/OnionScraper/geckodriver',
-                options=options,
-                firefox_profile=fp)
+            executable_path='/home/tony/Projects/OnionScraper/geckodriver',
+            options=options,
+            firefox_profile=fp
+            )
         url = 'http://' + onion
         driver.get(url)
         uid = str(uuid4()).split('-')[0]
@@ -131,8 +133,6 @@ class Plugin(Operator):
             self.logger.error('[x] Unable to take screenshot')
             return self.response("failure",None,onion)
 
-
-
     def get_tor_session(self):
         try:
             s = requests.session()
@@ -172,31 +172,31 @@ class Plugin(Operator):
         return
 
     def run_sessions(self, onion):
-            retry = 0
-            result = None
-            while True:
-                try:
-                    url = 'http://'+onion
-                    self.logger.info(url)
-                    content = self.session.get(url)
-                    if content.status_code == 200:
-                        result = content.json()
-                except JSONDecodeError as e:
-                    self.logger.debug(f'JSONDecodeError {e}')
-                    result = content.text
-                except Exception as e:
-                    self.logger.error(e)
-                    self.logger.debug(traceback.print_exc())
-                finally:
-                    if result:
-                        return self.response("success",result,onion)
-                    else:
-                        self.logger.info('[x] No results found retrying ...')
-                        retry += 1
-                        self.renew_connection()
-                        if retry > self.retries:
-                            self.logger.error('[x] Max retries exceeded')
-                            return self.response("failure",None, onion)
+        retry = 0
+        result = None
+        while True:
+            try:
+                url = 'http://'+onion
+                self.logger.info(url)
+                content = self.session.get(url)
+                if content.status_code == 200:
+                    result = content.json()
+            except JSONDecodeError as e:
+                self.logger.debug(f'JSONDecodeError {e}')
+                result = content.text
+            except Exception as e:
+                self.logger.error(e)
+                self.logger.debug(traceback.print_exc())
+            finally:
+                if result:
+                    return self.response("success",result,onion)
+                else:
+                    self.logger.info('[x] No results found retrying ...')
+                    retry += 1
+                    self.renew_connection()
+                    if retry > self.retries:
+                        self.logger.error('[x] Max retries exceeded')
+                        return self.response("failure",None, onion)
 
     def run_onionscan(self, onion):
         self.logger.info("[*] Running onionscan on %s", onion)
@@ -259,6 +259,3 @@ class Plugin(Operator):
         finally:
             pass
             #sys.exit(0)
-
-
-
diff --git a/onioningestor/operators/yara.py b/onioningestor/operators/yara.py
index 794c093..e23dc26 100644
--- a/onioningestor/operators/yara.py
+++ b/onioningestor/operators/yara.py
@@ -1,6 +1,7 @@
 from onionscraper.operators import Operator
 
+
 class Plugin(Operator):
     """Operator for output to flat CSV file."""
     def __init__(self, filename, base_score):