From 8258056befe1b0e022bbe1c5627d6b1f39caca65 Mon Sep 17 00:00:00 2001 From: danieleperera Date: Mon, 6 Jul 2020 22:43:35 +0000 Subject: [PATCH] renamed the package and fixed some issues --- onioningestor/__init__.py | 131 +++++++++++++ onioningestor/__main__.py | 49 +++++ onioningestor/config.py | 172 +++++++++++++++++ onioningestor/dbhandler.py | 75 ++++++++ onioningestor/loghandler.py | 33 ++++ onioningestor/operators/__init__.py | 95 ++++++++++ onioningestor/operators/html.py | 99 ++++++++++ onioningestor/operators/onionscan.py | 264 +++++++++++++++++++++++++++ onioningestor/operators/yara.py | 15 ++ onioningestor/sources/__init__.py | 41 +++++ onioningestor/sources/gist.py | 153 ++++++++++++++++ onioningestor/sources/gmail.py | 153 ++++++++++++++++ onioningestor/sources/reddit.py | 120 ++++++++++++ onioningestor/sources/simplefile.py | 31 ++++ 14 files changed, 1431 insertions(+) create mode 100644 onioningestor/__init__.py create mode 100644 onioningestor/__main__.py create mode 100644 onioningestor/config.py create mode 100644 onioningestor/dbhandler.py create mode 100644 onioningestor/loghandler.py create mode 100644 onioningestor/operators/__init__.py create mode 100644 onioningestor/operators/html.py create mode 100644 onioningestor/operators/onionscan.py create mode 100644 onioningestor/operators/yara.py create mode 100644 onioningestor/sources/__init__.py create mode 100644 onioningestor/sources/gist.py create mode 100644 onioningestor/sources/gmail.py create mode 100644 onioningestor/sources/reddit.py create mode 100644 onioningestor/sources/simplefile.py diff --git a/onioningestor/__init__.py b/onioningestor/__init__.py new file mode 100644 index 0000000..a5e655d --- /dev/null +++ b/onioningestor/__init__.py @@ -0,0 +1,131 @@ +import sys +import time +import traceback +import collections + +from . import config +from . import dbhandler +from . import loghandler + + +class Ingestor: + """ThreatIngestor main work logic. + + Handles reading the config file, calling sources, maintaining state, and + sending artifacts to operators. + """ + def __init__(self, args): + # Load logger + log = loghandler.LoggerHandler(args.logLevel) + self.logger = log.start_logging() + # Load config + self.config = config.Config(args.configFile, self.logger) + self.blacklist = self.config.blacklist() + + # Load Elasticsearch. + try: + self.es = dbhandler.DbHandlerElasticSearch( + self.config.elasticsearch(), + self.logger) + except Exception as e: + # Error loading elasticsearch. + self.logger.error(e) + self.logger.debug(traceback.print_exc()) + sys.exit(1) + + + # Instantiate plugins. + try: + self.logger.info("Initializing sources") + self.sources = {name: source(self.logger, **kwargs) + for name, source, kwargs in self.config.sources()} + + self.logger.info("initializing operators") + self.operators = {name: operator(self.logger, self.es, self.blacklist, **kwargs) + for name, operator, kwargs in self.config.operators()} + + self.logger.info("initializing notifiers") + #self.notifiers = {name: operator(**kwargs) + # for name, operator, kwargs in self.config.notifiers()} + except Exception as e: + # Error loading elasticsearch. 
+ self.logger.error(e) + self.logger.debug(traceback.print_exc()) + sys.exit(1) + + + def run(self): + """Run once, or forever, depending on config.""" + if self.config.daemon(): + self.logger.info("Running forever, in a loop") + self.run_forever() + else: + self.logger.info("Running once, to completion") + self.run_once() + + + def run_once(self): + """Run each source once, passing artifacts to each operator.""" + # Track some statistics about artifacts in a summary object. + summary = collections.Counter() + + for source in self.sources: + # Run the source to collect artifacts. + self.logger.info(f"Running source '{source}'") + try: + onions = self.sources[source].run() + if onions: + self.logger.info(f'Found hidden links') + else: + self.logger.info('No links found') + except Exception as e: + self.logger.error(e) + self.logger.error(traceback.print_exc()) + continue + + # Process artifacts with each operator. + for operator in self.operators: + self.logger.info(f"Processing found onions with operator '{operator}'") + try: + doc = self.operators[operator].process(onions) + # Save the source state. + self.es.save(doc) + except Exception as e: + self.logger.error(e) + self.logger.error(traceback.print_exc()) + continue + + + +# # Record stats and update the summary. +# types = artifact_types(doc.get('interestingKeywords')) +# summary.update(types) +# for artifact_type in types: +# self.logger.info(f'types[artifact_type]') + + # Log the summary. + self.logger.info(f"New artifacts: {dict(summary)}") + + + def run_forever(self): + """Run forever, sleeping for the configured interval between each run.""" + while True: + self.run_once() + + self.logger.info(f"Sleeping for {self.config.sleep()} seconds") + time.sleep(self.config.sleep()) + + +def artifact_types(artifact_list): + """Return a dictionary with counts of each artifact type.""" + types = {} + for artifact in artifact_list: + artifact_type = artifact.__class__.__name__.lower() + if artifact_type in types: + types[artifact_type] += 1 + else: + types[artifact_type] = 1 + + return types + + diff --git a/onioningestor/__main__.py b/onioningestor/__main__.py new file mode 100644 index 0000000..e00adb5 --- /dev/null +++ b/onioningestor/__main__.py @@ -0,0 +1,49 @@ +"""OnionScraper + +A Python3 application for indexing and scraping hidden services ElasticSearch + +Installation: + This application assumes you have python3 and pip3 installed. + + pip3 install -r requirements.txt + + +This software is provided subject to the MIT license stated below. +-------------------------------------------------- + MIT License + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in all + copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. +-------------------------------------------------- +""" +import argparse + +from onioningestor import Ingestor + + +# Load arguments from user +parser = argparse.ArgumentParser( + prog='onionscraper', + description=__doc__,formatter_class=argparse.RawDescriptionHelpFormatter) +parser.add_argument('-c', '--config',dest="configFile", required = True, help='Path to config file') +parser.add_argument("--log", dest="logLevel",default='INFO', choices=['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'], help="Set the logging level, default is INFO") + +args = parser.parse_args() +app = Ingestor(args) + +app.run() diff --git a/onioningestor/config.py b/onioningestor/config.py new file mode 100644 index 0000000..096c6cf --- /dev/null +++ b/onioningestor/config.py @@ -0,0 +1,172 @@ +import io +import importlib +import traceback + +import yaml + +from pathlib import Path + +SOURCE = 'onioningestor.sources' +OPERATOR = 'onioningestor.operators' + +INTERNAL_OPTIONS = [ + 'saved_state', + 'module', + 'credentials', +] + +ARTIFACT_TYPES = 'artifact_types' +FILTER_STRING = 'filter' +ALLOWED_SOURCES = 'allowed_sources' +NAME = 'name' + + +class Config: + """Config read/write operations, and convenience methods.""" + def __init__(self, filename, logger): + """Read a config file.""" + self.logger = logger + self.filename = filename + with io.open(self.filename, 'r') as f: + try: + self.logger.info("Loading config file") + self.config = yaml.safe_load(f.read()) + except yaml.error.YAMLError: + self.logger.error("YAML error in config") + + + @staticmethod + def _load_plugin(plugin_type, plugin): + """Returns plugin class or raises an exception. 
+ :raises: threatingestor.exceptions.PluginError + """ + try: + module = importlib.import_module('.'.join([plugin_type, plugin])) + return module.Plugin + except Exception as e: + print(e) + print(traceback.print_exc()) + + def daemon(self): + """Returns boolean, are we daemonizing?""" + return self.config['general']['daemon'] + + + def elasticsearch(self): + """Returns elasticsaerch config""" + return self.config['general']['elasticsearch'] + + + def sleep(self): + """Returns number of seconds to sleep between iterations, if daemonizing.""" + return self.config['general']['sleep'] + + def blacklist(self): + return self.config['general']['blacklist'].split(',') + +# def onionscanner(self): +# """Returns onionscanner config dict""" +# screenshots = self.config['onionscanner'].pop('screenshots_path', None) +# if screenshots: +# self.config['onionscanner']['screenshots_path'] = Path(screenshots) +# else: +# self.config['onionscanner']['screenshots_path'] = Path(__file__).parents[1]/'screenshots' +# blacklist = self.config['onionscanner'].pop('blacklist', None) +# if blacklist: +# self.config['onionscanner']['blacklist'] = blacklist.split(',') +# interestingKeywords = self.config['onionscanner'].pop('interestingKeywords', None) +# if interestingKeywords: +# self.config['onionscanner']['interestingKeywords'] = blacklist.split(',') +# return self.config['onionscanner'] + + + def notifiers(self): + """Returns notifiers config dictionary.""" + return self.config.get('notifiers', {}) + + + def logging(self): + """Returns logging config dictionary.""" + return self.config.get('logging', {}) + + + def credentials(self, credential_name): + """Return a dictionary with the specified credentials.""" + for credential in self.config['credentials']: + for key, value in credential.items(): + if key == NAME and value == credential_name: + return credential + return {} + + + def sources(self): + """Return a list of (name, Source class, {kwargs}) tuples. + :raises: threatingestor.exceptions.PluginError + """ + sources = [] + + for source in self.config['sources']: + kwargs = {} + for key, value in source.items(): + if key not in INTERNAL_OPTIONS: + kwargs[key] = value + + elif key == 'credentials': + # Grab these named credentials + credential_name = value + for credential_key, credential_value in self.credentials(credential_name).items(): + if credential_key != NAME: + kwargs[credential_key] = credential_value + + # load and initialize the plugin + self.logger.info(f"Found source '{source[NAME]}'") + sources.append((source[NAME], self._load_plugin(SOURCE, source['module']), kwargs)) + + self.logger.info(f"Found {len(sources)} total sources") + return sources + + + def operators(self): + """Return a list of (name, Operator class, {kwargs}) tuples. 
+ :raises: threatingestor.exceptions.PluginError + """ + operators = [] + for operator in self.config['operators']: + kwargs = {} + for key, value in operator.items(): + if key not in INTERNAL_OPTIONS: + if key == ARTIFACT_TYPES: + # parse out special artifact_types option + artifact_types = [] + for artifact in value: + try: + artifact_types.append(threatingestor.artifacts.STRING_MAP[artifact.lower().strip()]) + except KeyError: + # ignore invalid artifact types + pass + kwargs[key] = artifact_types + + elif key == FILTER_STRING: + # pass in special filter_string option + kwargs['filter_string'] = value + + elif key == NAME: + # exclude name key from operator kwargs, since it's not used + pass + + else: + kwargs[key] = value + + elif key == 'credentials': + # Grab these named credentials + credential_name = value + for credential_key, credential_value in self.credentials(credential_name).items(): + if credential_key != NAME: + kwargs[credential_key] = credential_value + + # load and initialize the plugin + self.logger.info(f"Found operator '{operator[NAME]}'") + operators.append((operator[NAME], self._load_plugin(OPERATOR, operator['module']), kwargs)) + + self.logger.info(f"Found {len(operators)} total operators") + return operators diff --git a/onioningestor/dbhandler.py b/onioningestor/dbhandler.py new file mode 100644 index 0000000..3de21e2 --- /dev/null +++ b/onioningestor/dbhandler.py @@ -0,0 +1,75 @@ +import sys +import traceback + +from elasticsearch import Elasticsearch, helpers + +class DbHandlerElasticSearch: + def __init__(self, config, logger): + self.logger = logger + self.logger.info('Creating Elasticsearch mapping') + self.config = config + self.mapping = ''' + { + "mappings": { + "_doc": { + "properties": { + "hiddenService": { + "type": "text" + }, + "blacklist": { + "type": "keyword" + }, + "monitor": { + "type": "boolean" + }, + "simple-html": { + "type": "nested", + "properties": { + "HTML": { + "type": "long" + }, + "title": { + "type": "text" + }, + "language": { + "type": "text" + }, + "status":{ + "type":"text" + }, + "date-indexed": { + "type": "date" + } + } + } + } + } + } + } + ''' + self.index = self.config['index'] + try: + self.es = Elasticsearch([{ + 'host':self.config['host'], + 'port':self.config['port']}]) + self.es.indices.create( + index=self.index, + body=self.mapping, + ignore=400) + except Exception as e: + self.logger.error(e) + self.logger.error(traceback.format_exc()) + sys.exit(0) + + def count(self): + self.es.indices.refresh(self.index) + status = self.es.count(index=self.index) + if status['_shards']['successful'] == 1: + self.logger.info('Successful') + self.logger.info('Count:%d',status['count']) + else: + self.logger.error(status) + + def save(self, doc): + self.es.index(index=self.index,body=doc) + self.count() diff --git a/onioningestor/loghandler.py b/onioningestor/loghandler.py new file mode 100644 index 0000000..1189b4e --- /dev/null +++ b/onioningestor/loghandler.py @@ -0,0 +1,33 @@ +import os +import logging +from pathlib import Path + +class LoggerHandler(): + def __init__(self, level): + self.level = getattr(logging, level) + self.logger = logging.getLogger("OnionScraper") + self.logger.setLevel(self.level) + + # create console handler and set level to debug + ch = logging.StreamHandler() + ch.setLevel(self.level) + + # create file logging + logFile = Path(__file__).parents[1] + logging_path = os.path.join(logFile, "info.log") + fh = logging.FileHandler(logging_path) + + # create formatter + formatter = 
logging.Formatter('[%(asctime)s] - %(name)s - %(levelname)s - %(message)s',datefmt='%a, %d %b %Y %H:%M:%S') + formatter_console = logging.Formatter('[%(asctime)s] - %(levelname)s - %(message)s',datefmt='%d %b %Y %H:%M:%S') + # add formatter to ch + ch.setFormatter(formatter_console) + fh.setFormatter(formatter) + # add ch to logger + self.logger.addHandler(ch) #added logging into console + self.logger.addHandler(fh) #added logging into file + + def start_logging(self): + self.logger.info('Starting OnionScraper') + return self.logger + diff --git a/onioningestor/operators/__init__.py b/onioningestor/operators/__init__.py new file mode 100644 index 0000000..5c3ed5c --- /dev/null +++ b/onioningestor/operators/__init__.py @@ -0,0 +1,95 @@ +import re +import sys +import json + + +class Operator: + """Base class for all Operator plugins. + + Note: This is an abstract class. You must extend ``__init__`` and call + ``super`` to ensure this class's constructor is called. You must override + ``handle_artifact`` with the same signature. You may define additional + ``handle_{artifact_type}`` methods as needed (see the threatkb operator for + an example) - these methods are purely convention, and are not required. + + When adding additional methods to child classes, consider prefixing the + method name with an underscore to denote a ``_private_method``. Do not + override other existing methods from this class. + """ + def __init__(self, logger, elasticsearch, allowed_sources=None): + """Override this constructor in child classes. + + The arguments above (artifact_types, filter_string, allowed_sources) + should be accepted explicity as above, in all child classes. + + Additional arguments should be added: url, auth, etc, whatever is + needed to set up the object. + + Each operator should default self.artifact_types to a list of Artifacts + supported by the plugin, and allow passing in artifact_types to + overwrite that default. + + Example: + + >>> self.artifact_types = artifact_types or [ + ... artifacts.IPAddress, + ... artifacts.Domain, + ... ] + + It's recommended to call this __init__ method via super from all child + classes. Remember to do so *before* setting any default artifact_types. + """ + self.logger = logger + self.blacklist = re.compile('|'.join([re.escape(word) for word in allowed_sources]), re.IGNORECASE) + self.es = elasticsearch + + def response(self, content, onion, operator_name): + """ + status: success/failure + content: dict + onion: str + return: dict + """ + try: + return {operator_name: json.loads(str(content)), 'hiddenService': onion} + except json.decoder.JSONDecodeError as e: + self.logger.info('JosnDecode Error') + return {operator_name: content, 'hiddenService': onion} + #except TypeError: + # return {operator_name: None, 'hiddenService': onion} + except Exception as e: + self.logger.error(e) + + def handle_onion(self, url): + """Override with the same signature. + + :param artifact: A single ``Artifact`` object. + :returns: None (always ignored) + """ + raise NotImplementedError() + + + def _onion_is_allowed(self, response, type='URL'): + """Returns True if this is allowed by this plugin's filters.""" + # Must be in allowed_sources, if set. 
+ if type == 'URL': + print(response) + blacklist = self.blacklist.findall(response['hiddenService']) + elif type == 'HTML': + response['simple-html'].pop('status') + response['simple-html']['status'] = 'blocked' + blacklist = self.blacklist.findall(response['simple-html']['HTML']) + if blacklist: + self.es.save(response) + return False + return True + + + def process(self, onions): + """Process all applicable onions.""" + for onion in onions: + if self._onion_is_allowed( + self.response({'status':'blocked'},onion.url,'regex-blacklist'), + type='URL'): + self.handle_onion(onion.url) + diff --git a/onioningestor/operators/html.py b/onioningestor/operators/html.py new file mode 100644 index 0000000..8b6d415 --- /dev/null +++ b/onioningestor/operators/html.py @@ -0,0 +1,99 @@ +import time +import json +import traceback +from datetime import datetime as dt +from json.decoder import JSONDecodeError + +import requests + +from bs4 import BeautifulSoup + +from langdetect import detect + +from stem.control import Controller +from stem import Signal + +from onioningestor.operators import Operator + + +class Plugin(Operator): + """Simple-html + This plugin collects HTML code from onion link + """ + + def __init__(self, logger, elasticsearch, allowed_sources, **kwargs): + super(Plugin, self).__init__(logger, elasticsearch, allowed_sources) + self.plugin_name = 'simple-html' + self.logger.info(f"Initializing {self.plugin_name}") + + self.timeout = int(kwargs['timeout']) + self.retries = int(kwargs['retries']) + + self.proxy = kwargs['socks5'] + self.torControl = kwargs['TorController'] + self.headers ={ + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:72.0) Gecko/20100101 Firefox/72.0', + 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', + 'Accept-Language':'en-US,en;q=0.5', + 'DNT': '1', 'Connection': + 'keep-alive', + 'Upgrade-Insecure-Requests': '1'} + + def get_tor_session(self): + try: + s = requests.session() + s.proxies = self.proxy + s.headers.update(self.headers) + except Exception as e: + self.logger.error(e) + self.logger.debug(traceback.print_exc()) + return s + + def renew_connection(self): + with Controller.from_port(port = self.torControl['port']) as controller: + # Now we switch TOR identities to make sure we have a good connection + self.logger.info('Getting new Tor IP') + # authenticate to our local TOR controller + controller.authenticate(self.torControl['password']) + # send the signal for a new identity + controller.signal(Signal.NEWNYM) + # wait for the new identity to be initialized + time.sleep(controller.get_newnym_wait()) + session = self.get_tor_session() + self.logger.info(f"IP is {session.get('http://httpbin.org/ip').json()['origin']}") + + def run_sessions(self, onion): + retry = 0 + result = None + while True: + try: + url = 'http://'+onion + self.logger.info(url) + content = self.get_tor_session().get(url) + if content.status_code == 200: + result = content.text + if result: + html = BeautifulSoup(result,features="lxml") + index = {'HTML':result,'title':html.title.text,'language':detect(html.text),'date-crawled':dt.utcnow().strftime('%Y-%m-%dT%H:%M:%S.%f')+ 'Z','status':'success'} + return self.response(index, onion, self.plugin_name) + except requests.exceptions.ConnectionError as connection_error: + self.logger.error(f'Failed connecting to http://{url}') + self.logger.debug(connection_error) + except Exception as e: + self.logger.error(e) + self.logger.debug(traceback.print_exc()) + + self.logger.info('[x] No results 
found retrying ...') + retry += 1 + self.renew_connection() + if retry > self.retries: + self.logger.error('[x] Max retries exceeded') + return self.response({'status':"failure"}, onion, self.plugin_name) + + def handle_onion(self, onion): + content = self.run_sessions(onion) + print(content) + if content[self.plugin_name]['status'] == 'success': + if self._onion_is_allowed(content): + self.es.save(content) + diff --git a/onioningestor/operators/onionscan.py b/onioningestor/operators/onionscan.py new file mode 100644 index 0000000..833e72b --- /dev/null +++ b/onioningestor/operators/onionscan.py @@ -0,0 +1,264 @@ +import re +import os +import sys +import json +import time +import random +import traceback +import subprocess +from uuid import uuid4 +from pathlib import Path +from datetime import datetime as dt +from json.decoder import JSONDecodeError +from concurrent.futures import ProcessPoolExecutor +from threading import Timer + +import requests + +from stem.control import Controller +from stem import Signal + +from selenium import webdriver +from selenium.webdriver.firefox.options import Options +from selenium.webdriver.firefox.firefox_binary import FirefoxBinary + +from onionscraper.operators import Operator + +class Plugin(Operator): + """OnionScraper main work logic. + + Handles reading the config file, calling sources, maintaining state and + sending artifacts to operators. + """ + def __init__(self, logger, **kwargs): + self.logger = logger + self.logger.info('Initializing OnionScanner') + screenshots = kwargs.pop('screenshots_path', None) + if screenshots: + self.screenshots = Path(screenshots) + else: + self.screenshots = Path(__file__).parents[1]/'screenshots' + self.onionscan = kwargs['binpath'] + self.timeout = int(kwargs['timeout']) + self.proxy = kwargs['socks5'] + self.torControl = kwargs['TorController'] + self.retries = int(kwargs['retries']) + self.headers ={ + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:72.0) Gecko/20100101 Firefox/72.0', + 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', + 'Accept-Language':'en-US,en;q=0.5', + 'DNT': '1', 'Connection': + 'keep-alive', + 'Upgrade-Insecure-Requests': '1'} + + + blacklist = kwargs['blacklist'].split(',') + self.blacklist = re.compile('|'.join([re.escape(word) for word in blacklist]), re.IGNORECASE) + keywords = kwargs['interestingKeywords'].split(',') + self.keywords = re.compile('|'.join([re.escape(word) for word in keywords]), re.IGNORECASE) + self.session = self.get_tor_session() + + def response(self, status, content, onion): + """ + status: success/failure + content: dict + onion: str + return: dict + """ + return {'status': status, 'data': content, 'onion': onion} + + def parseDoc(self, data): + data['onionscan'].pop('simpleReport', None) + crawls = data['onionscan'].pop('crawls', None) + hiddenService = data['onionscan'].pop('hiddenService', None) + data['onionscan']['crawls'] = [*crawls] + data['hiddenService'] = hiddenService + for onion in crawls.keys(): + print(onion) + with open('/home/tony/Projects/OnionScraper_v2/onion_master_list.txt', 'a') as fp: + fp.write("%s\n" % onion) + #q.enqueue(self.crawl, onion) + #with open('test.json', 'w', encoding='utf-8') as f: + # json.dump(data, f, ensure_ascii=False, indent=4) + return data + + def format_directory(self, directory): + d = dt.now() + year = str(d.year) + month = str(d.month) + # prefix month and day with "0" if it is only one digit + if len(month) < 2: + month = "0" + month + day = str(d.day) + if 
len(day) < 2: + day = "0" + day + save_path = directory/year/month/day + if not os.path.isdir(save_path): + self.logger.info("[*] Creating directory to save screenshots") + os.makedirs(save_path) + + return save_path + + def take_screenshot(self, save_path, onion): + binary = FirefoxBinary('/home/tony/Projects/OnionScraper/geckodriver') + fp = webdriver.FirefoxProfile() + fp.set_preference('network.proxy.type', 1) + fp.set_preference('network.proxy.socks', '127.0.0.1') + fp.set_preference('network.proxy.socks_port', 9050) + fp.set_preference('network.proxy.socks_remote_dns', True) + + options = Options() + options.headless = True + driver = webdriver.Firefox( + executable_path='/home/tony/Projects/OnionScraper/geckodriver', + options=options, + firefox_profile=fp) + url = 'http://' + onion + driver.get(url) + uid = str(uuid4()).split('-')[0] + filename = f"{onion}_screenshot_{uid}.png" + f_name = f"{save_path}/{filename}" + driver.save_screenshot(f_name) + + driver.quit() + + if os.path.isfile(f_name): + self.logger.info(f'[*] Screenshot was taken. {f_name}') + dateScreenshoted = dt.utcnow().strftime('%Y-%m-%dT%H:%M:%S.%f')+ 'Z' + result = {'dateScreenshoted':dateScreenshoted,'filename':filename} + return self.response("success",result,onion) + else: + self.logger.error('[x] Unable to take screenshot') + return self.response("failure",None,onion) + + + + def get_tor_session(self): + try: + s = requests.session() + s.proxies = self.proxy + s.headers.update(self.headers) + except Exception as e: + self.logger.error(e) + self.logger.debug(traceback.print_exc()) + return s + + # signal TOR for a new connection + def renew_connection(self): + with Controller.from_port(port = self.torControl['port']) as controller: + # Now we switch TOR identities to make sure we have a good connection + self.logger.info('Getting new Tor IP') + # authenticate to our local TOR controller + controller.authenticate(self.torControl['password']) + # send the signal for a new identity + controller.signal(Signal.NEWNYM) + # wait for the new identity to be initialized + time.sleep(controller.get_newnym_wait()) + session = self.get_tor_session() + self.logger.info(f"IP is {session.get('http://httpbin.org/ip').json()['origin']}") + + def handle_timeout(self, process, onion): + # + # Handle a timeout from the onionscan process. + # + + try: + # kill the onionscan process + process.kill() + self.logger.info("[!!!] 
Killed the onionscan process.") + except: + pass + self.renew_connection() + return + + def run_sessions(self, onion): + retry = 0 + result = None + while True: + try: + url = 'http://'+onion + self.logger.info(url) + content = self.session.get(url) + if content.status_code == 200: + result = content.json() + except JSONDecodeError as e: + self.logger.debug(f'JSONDecodeError {e}') + result = content.text + except Exception as e: + self.logger.error(e) + self.logger.debug(traceback.print_exc()) + finally: + if result: + return self.response("success",result,onion) + else: + self.logger.info('[x] No results found retrying ...') + retry += 1 + self.renew_connection() + if retry > self.retries: + self.logger.error('[x] Max retries exceeded') + return self.response("failure",None, onion) + + def run_onionscan(self, onion): + self.logger.info("[*] Running onionscan on %s", onion) + + # fire up onionscan + process = subprocess.Popen([self.onionscan,"--webport=0","--jsonReport","--simpleReport=false",onion],stdout=subprocess.PIPE,stderr=subprocess.PIPE) + + # start the timer and let it run till timeout minutes + process_timer = Timer(300,self.handle_timeout,args=[process,onion]) + process_timer.start() + + # wait for the onion scan results + stdout = process.communicate()[0] + + # we have received valid results so we can kill the timer + if process_timer.is_alive(): + process_timer.cancel() + try: + return self.response("success",json.loads(stdout),onion) + except json.decoder.JSONDecodeError: + pass + + self.logger.info("[!!!] Process timed out for %s", onion) + + return self.response("failure",None, onion) + + def handle_onion(self, onion_tuple): + onion = onion_tuple.url + self.logger.info(f'Processing {onion} with onionscan') + try: + blacklist_URL = self.blacklist.search(onion) + if blacklist_URL: + self.logger.info(f"[X] Blocked by blacklist => matched keyword {blacklist_URL.group()}") + else: + self.logger.debug("[*] URL blacklist test: PASSED") + results = self.run_onionscan(onion) + if results['status'] == 'success':# and results['data']['webDetected'] == 'true': + content = self.run_sessions(onion) + if content['status'] == 'success': + blacklist_CONTENT = self.blacklist.search(content['data']) + if blacklist_CONTENT: + self.logger.info(f"[X] Blocked by blacklist content => matched keyword {blacklist_CONTENT.group()}") + else: + self.logger.debug("[*] CONTENT blacklist test: PASSED") + screenshot = self.take_screenshot(self.format_directory(self.screenshots), onion) + self.logger.info("Indexing!") + doc = { + 'onionscan':json.loads(results['data']), + 'html':content['data'], + 'screenshots':screenshot['data'], + 'interestingKeywords':self.interestingKeywords.findall(content['data']) + } + return self.parseDoc(doc) + + else: + self.logger.info(f"[x] hidden service {onion} is not active") + except Exception as e: + self.logger.error(e) + self.logger.error(traceback.print_exc()) + finally: + pass + #sys.exit(0) + + + diff --git a/onioningestor/operators/yara.py b/onioningestor/operators/yara.py new file mode 100644 index 0000000..794c093 --- /dev/null +++ b/onioningestor/operators/yara.py @@ -0,0 +1,15 @@ + +from onionscraper.operators import Operator + +class Plugin(Operator): + """Operator for output to flat CSV file.""" + def __init__(self, filename, base_score): + """CSV operator.""" + self.filename = filename + + #super(Plugin, self).__init__(artifact_types, filter_string, allowed_sources) + + + def handle_artifact(self, artifact): + """Operate on a single artifact.""" + pass diff 
--git a/onioningestor/sources/__init__.py b/onioningestor/sources/__init__.py
new file mode 100644
index 0000000..14bb4e4
--- /dev/null
+++ b/onioningestor/sources/__init__.py
@@ -0,0 +1,41 @@
+from collections import namedtuple
+
+class Source(object):
+    """Base class for all Source plugins.
+    Note: This is an abstract class. You must override ``__init__`` and ``run``
+    in child classes. You should not override ``process_element``. When adding
+    additional methods to child classes, consider prefixing the method name
+    with an underscore to denote a ``_private_method``.
+    """
+    def __init__(self, name, *args, **kwargs):
+        """Override this constructor in child classes.
+        The first argument must always be ``name``.
+        Other arguments should be url, auth, etc., whatever is needed to set
+        up the object.
+        """
+        self.onion = namedtuple('onion', ['url','source','type'])
+
+
+    def run(self):
+        """Run and return a list of ``onion`` namedtuples (url, source, type).
+        Override this method in child classes.
+        The method signature and return values must remain consistent.
+        The method should attempt to pick up where we left off using
+        ``saved_state``, if supported. If ``saved_state`` is ``None``, you can
+        assume this is a first run. If state is maintained by the remote
+        resource (e.g. as it is with SQS), ``saved_state`` should always be
+        ``None``.
+        """
+        raise NotImplementedError()
+
+
+    def process_element(self, content, reference_link, include_nonobfuscated=False):
+        """Take a single source content/url and return a list of Artifacts.
+        This is the main work block of Source plugins, which handles
+        IOC extraction and artifact creation.
+        :param content: String content to extract from.
+        :param reference_link: Reference link to attach to all artifacts.
+        :param include_nonobfuscated: Include non-defanged URLs in output?
+ """ + logger.debug(f"Processing in source '{self.name}'") + diff --git a/onioningestor/sources/gist.py b/onioningestor/sources/gist.py new file mode 100644 index 0000000..15e2049 --- /dev/null +++ b/onioningestor/sources/gist.py @@ -0,0 +1,153 @@ +#!/usr/bin/python3 +# -*- coding: utf-8 -*- + +__author__ = 'Andrey Glauzer' +__license__ = "MIT" +__version__ = "1.0.1" +__maintainer__ = "Andrey Glauzer" +__status__ = "Development" + +import requests +import json +import re +import re +import urllib.parse +from random import choice +import time +from bs4 import BeautifulSoup + + +from onionscraper.sources import Source + + +class Plugin(Source): + + def __init__(self, logger, name, url): + self.logger = logger + self.name = name + self.url = url + self.desktop_agents = [ + 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:60.0) Gecko/20100101 Firefox/60.0'] + super().__init__(self) + + + def run(self): + self.logger.info('Starting Gist Scraper') + self.cookies() + self.pagination() + self.scraping() + return self.raw() + + @property + def random_headers(self): + return { + 'User-Agent': choice(self.desktop_agents), + 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8' + } + + def cookies(self): + + self.logger.info('Setting GIST cookies') + + with requests.Session() as self.session: + self.headers = self.random_headers + + request = self.session.get(self.url, headers=self.headers) + + if request.status_code == 200: + pass + else: + self.logger.error('No Response from GIST') + + def pagination(self): + request = self.session.get( + f"https://gist.github.com/search?l=Text&q={urllib.parse.quote('.onio')}", headers=self.headers) + self.soup = BeautifulSoup(request.content, features="lxml") + + pages = [] + self.urls = [self.url] + try: + for pagination in self.soup.find('div', {'class': 'pagination'}).findAll('a'): + pages.append(pagination.get_text()) + except: + pages = False + + if pages: + cont = 2 + while cont <= 1: # int(pages[-2]): + cont += 1 + full_url = f"https://gist.github.com/search?l=Text&p={cont-1}&q={urllib.parse.quote('.onio')}" + self.urls.append(full_url) + + def scraping(self): + url = [] + for inurl in self.urls: + self.logger.info(f"Connecting to {inurl}") + time.sleep(5) + request = self.session.get(inurl, headers=self.headers) + + if request.status_code == 200: + soup = BeautifulSoup(request.content, features="lxml") + for code in soup.findAll('div', {'class': 'gist-snippet'}): + if '.onion' in code.get_text().lower(): + for raw in code.findAll('a', {'class': 'link-overlay'}): + try: + url.append(raw['href']) + except: + pass + self.urls_raw = [] + for get in url: + self.logger.info(f"Connecting to {get}") + time.sleep(5) + try: + request = self.session.get(get, headers=self.headers) + + if request.status_code == 200: + soup = BeautifulSoup(request.content, features="lxml") + + for raw in soup.findAll('a', {'class': 'btn btn-sm'}): + try: + gist_url = f"https://gist.githubusercontent.com{raw['href']}" + + self.urls_raw.append(gist_url) + + except: + pass + except(requests.exceptions.ConnectionError, + requests.exceptions.ChunkedEncodingError, + requests.exceptions.ReadTimeout, + requests.exceptions.InvalidURL) as e: + self.logger.error( + f"I was unable to connect to the url, because an error occurred.\n{e}") + pass + + def raw(self): + self.logger.info('Performing replaces and regex. 
WAIT...') + itens = [] + onions = [] + for raw in self.urls_raw: + if '.txt' in raw.lower() \ + or '.csv' in raw.lower(): + time.sleep(5) + request = self.session.get(raw, headers=self.headers) + self.soup = BeautifulSoup(request.content, features="lxml") + for pre in self.soup.findAll('body'): + list = pre.get_text().split('\n') + itens.extend(list) + + regex = re.compile( + "[A-Za-z0-9]{0,12}\.?[A-Za-z0-9]{12,50}\.onion") + + for lines in itens: + rurls = lines \ + .replace('\xad', '') \ + .replace('\n', '') \ + .replace("http://", '') \ + .replace("https://", '') \ + .replace("www.", "") + + url = regex.match(rurls) + + if url is not None: + onions.append(self.onion(url=url.group(), source='gist', type='domain')) + return onions diff --git a/onioningestor/sources/gmail.py b/onioningestor/sources/gmail.py new file mode 100644 index 0000000..a0ddf3f --- /dev/null +++ b/onioningestor/sources/gmail.py @@ -0,0 +1,153 @@ +#!/usr/bin/python3 +# -*- coding: utf-8 -*- + +__author__ = 'Daneiele Perera' +__license__ = "MIT" +__version__ = "1.0.1" +__maintainer__ = "Daniele Perera" +__status__ = "Development" + +import requests +import json +import re +import re +import urllib.parse +from random import choice +import time +from bs4 import BeautifulSoup + + +from onionscraper.sources import Source + + +class Plugin(Source): + + def __init__(self, logger, name, url): + self.logger = logger + self.name = name + self.url = url + self.desktop_agents = [ + 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:60.0) Gecko/20100101 Firefox/60.0'] + super().__init__(self) + + + def run(self): + self.logger.info('Starting Gist Scraper') + self.cookies() + self.pagination() + self.scraping() + return self.raw() + + @property + def random_headers(self): + return { + 'User-Agent': choice(self.desktop_agents), + 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8' + } + + def cookies(self): + + self.logger.info('Setting GIST cookies') + + with requests.Session() as self.session: + self.headers = self.random_headers + + request = self.session.get(self.url, headers=self.headers) + + if request.status_code == 200: + pass + else: + self.logger.error('No Response from GIST') + + def pagination(self): + request = self.session.get( + f"https://gist.github.com/search?l=Text&q={urllib.parse.quote('.onio')}", headers=self.headers) + self.soup = BeautifulSoup(request.content, features="lxml") + + pages = [] + self.urls = [self.url] + try: + for pagination in self.soup.find('div', {'class': 'pagination'}).findAll('a'): + pages.append(pagination.get_text()) + except: + pages = False + + if pages: + cont = 2 + while cont <= 1: # int(pages[-2]): + cont += 1 + full_url = f"https://gist.github.com/search?l=Text&p={cont-1}&q={urllib.parse.quote('.onio')}" + self.urls.append(full_url) + + def scraping(self): + url = [] + for inurl in self.urls: + self.logger.info(f"Connecting to {inurl}") + time.sleep(5) + request = self.session.get(inurl, headers=self.headers) + + if request.status_code == 200: + soup = BeautifulSoup(request.content, features="lxml") + for code in soup.findAll('div', {'class': 'gist-snippet'}): + if '.onion' in code.get_text().lower(): + for raw in code.findAll('a', {'class': 'link-overlay'}): + try: + url.append(raw['href']) + except: + pass + self.urls_raw = [] + for get in url: + self.logger.info(f"Connecting to {get}") + time.sleep(5) + try: + request = self.session.get(get, headers=self.headers) + + if request.status_code == 200: + soup = BeautifulSoup(request.content, 
features="lxml") + + for raw in soup.findAll('a', {'class': 'btn btn-sm'}): + try: + gist_url = f"https://gist.githubusercontent.com{raw['href']}" + + self.urls_raw.append(gist_url) + + except: + pass + except(requests.exceptions.ConnectionError, + requests.exceptions.ChunkedEncodingError, + requests.exceptions.ReadTimeout, + requests.exceptions.InvalidURL) as e: + self.logger.error( + f"I was unable to connect to the url, because an error occurred.\n{e}") + pass + + def raw(self): + self.logger.info('Performing replaces and regex. WAIT...') + itens = [] + onions = [] + for raw in self.urls_raw: + if '.txt' in raw.lower() \ + or '.csv' in raw.lower(): + time.sleep(5) + request = self.session.get(raw, headers=self.headers) + self.soup = BeautifulSoup(request.content, features="lxml") + for pre in self.soup.findAll('body'): + list = pre.get_text().split('\n') + itens.extend(list) + + regex = re.compile( + "[A-Za-z0-9]{0,12}\.?[A-Za-z0-9]{12,50}\.onion") + + for lines in itens: + rurls = lines \ + .replace('\xad', '') \ + .replace('\n', '') \ + .replace("http://", '') \ + .replace("https://", '') \ + .replace("www.", "") + + url = regex.match(rurls) + + if url is not None: + onions.append(self.onion(url=url.group(), source='gist', type='domain')) + return onions diff --git a/onioningestor/sources/reddit.py b/onioningestor/sources/reddit.py new file mode 100644 index 0000000..8dd2e4d --- /dev/null +++ b/onioningestor/sources/reddit.py @@ -0,0 +1,120 @@ +#!/usr/bin/python3 +# -*- coding: utf-8 -*- + +__author__ = 'Andrey Glauzer' +__license__ = "MIT" +__version__ = "1.0.1" +__maintainer__ = "Andrey Glauzer" +__status__ = "Development" + +import requests +import json +import re +import logging +import re +import urllib.parse +from random import choice +from bs4 import BeautifulSoup + + +class Reddit: + def __init__(self): + self.session = requests.session() + + self.source = 'Reddit' + + self.url = 'https://api.pushshift.io/reddit/search/comment/?subreddit=onions&limit=1000000' + self.desktop_agents = [ + 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:60.0) Gecko/20100101 Firefox/60.0'] + + @property + def random_headers(self): + return { + 'User-Agent': choice(self.desktop_agents), + 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8' + } + + @property + def start(self): + self.reddit_json() + + def reddit_json(self): + print('Getting Reddit API information') + onionurl = [] + try: + request = self.session.get(self.url, headers=self.random_headers) + + loaded_json = json.loads(request.content) + + print( + 'Filtering the URLs that have the word .onion in the text') + for data in loaded_json['data']: + reddit_url = 'https://www.reddit.com{}'.format( + data['permalink']) + try: + request = self.session.get( + reddit_url, headers=self.random_headers) + soup = BeautifulSoup(request.content, features="lxml") + + for raw in soup.findAll('a', {'rel': 'nofollow'}): + if 'https://' in raw['href']: + raw_text = self.raw(url=raw['href']) + if raw_text is not None: + print( + 'Applying REGEX. 
Wait...')
+                                regex = re.compile(
+                                    "[A-Za-z0-9]{0,12}\.?[A-Za-z0-9]{12,50}\.onion")
+
+                                for lines in raw_text.split('\n'):
+                                    rurls = lines \
+                                        .replace('\xad', '') \
+                                        .replace('\n', '') \
+                                        .replace("http://", '') \
+                                        .replace("https://", '') \
+                                        .replace(r'\s', '') \
+                                        .replace('\t', '')
+
+                                    xurl = regex.match(rurls)
+                                    if xurl is not None:
+                                        onionurl.append(xurl.group())
+
+                except(requests.exceptions.ConnectionError,
+                       requests.exceptions.ChunkedEncodingError,
+                       requests.exceptions.ReadTimeout,
+                       requests.exceptions.InvalidURL) as e:
+                    print(
+                        'I was unable to connect to the url, because an error occurred.\n{e}'.format(e=e))
+
+        except(requests.exceptions.ConnectionError,
+               requests.exceptions.ChunkedEncodingError,
+               requests.exceptions.ReadTimeout,
+               requests.exceptions.InvalidURL) as e:
+            print(
+                'I was unable to connect to the url, because an error occurred.\n{e}'.format(e=e))
+
+        return onionurl
+
+    def raw(self, url):
+        try:
+            if url is not None:
+                request = self.session.get(url, headers=self.random_headers)
+                print(
+                    'Connecting to {url} - {status}'.format(url=url, status=request.status_code))
+
+                if request.status_code == 200:
+
+                    soup = BeautifulSoup(request.content, features="lxml")
+                    for s in soup(['script', 'style']):
+                        s.decompose()
+
+                    return ' '.join(soup.stripped_strings)
+
+        except (requests.exceptions.ConnectionError,
+                requests.exceptions.ChunkedEncodingError,
+                requests.exceptions.ReadTimeout,
+                requests.exceptions.TooManyRedirects) as e:
+            pass
+
+if __name__ == '__main__':
+    app = Reddit()
+    app.start
diff --git a/onioningestor/sources/simplefile.py b/onioningestor/sources/simplefile.py
new file mode 100644
index 0000000..b5656d7
--- /dev/null
+++ b/onioningestor/sources/simplefile.py
@@ -0,0 +1,31 @@
+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+
+__author__ = 'Andrey Glauzer'
+__license__ = "MIT"
+__version__ = "1.0.1"
+__maintainer__ = "Andrey Glauzer"
+__status__ = "Development"
+
+import requests
+from pathlib import Path
+
+from onioningestor.sources import Source
+
+
+class Plugin(Source):
+
+    def __init__(self, logger, name, filename):
+        self.logger = logger
+        self.name = name
+        self.filename = filename
+        super().__init__(name)
+
+
+    def run(self):
+        filepath = Path(__file__).parents[2]/self.filename
+        with open(filepath, 'r') as fp:
+            lines = fp.read().splitlines()
+            for onion in lines:
+                yield self.onion(url=onion,source='simple-file',type='domain')
+
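For reference, Config in onioningestor/config.py expects a YAML file with a general section (daemon, sleep, blacklist, elasticsearch) plus sources and operators lists whose module values name plugins under onioningestor.sources and onioningestor.operators. The snippet below is a minimal sketch of such a config, with hypothetical hosts, paths, and passwords; it is not part of the patch and simply round-trips the YAML through yaml.safe_load the same way Config.__init__ does, so the structure can be checked before a real run.

import yaml

EXAMPLE_CONFIG = """
general:
  daemon: false
  sleep: 3600
  blacklist: keyword1,keyword2
  elasticsearch:
    host: 127.0.0.1
    port: 9200
    index: onions

sources:
  - name: simple-file
    module: simplefile
    filename: onion_master_list.txt

operators:
  - name: simple-html
    module: html
    timeout: 30
    retries: 2
    socks5:
      http: socks5h://127.0.0.1:9050
      https: socks5h://127.0.0.1:9050
    TorController:
      port: 9051
      password: changeme
"""

config = yaml.safe_load(EXAMPLE_CONFIG)
print(config["general"]["elasticsearch"])        # {'host': '127.0.0.1', 'port': 9200, 'index': 'onions'}
print([s["module"] for s in config["sources"]])  # ['simplefile']

Saved as config.yml, a file with this shape would be consumed by the entry point in __main__.py via python3 -m onioningestor -c config.yml --log DEBUG.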
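The Source base class in onioningestor/sources/__init__.py states the plugin contract loosely; in practice the bundled sources build self.onion namedtuples and return (or yield) them from run(), and Ingestor.run_once() hands that collection to each operator's process(). Below is a minimal sketch of an additional source following the same convention; the module name, URL, and field values are hypothetical and not part of this patch.

# onioningestor/sources/mylist.py (hypothetical module)
import requests

from onioningestor.sources import Source


class Plugin(Source):
    """Fetch newline-separated onion domains from a plain-text URL."""

    def __init__(self, logger, name, url):
        self.logger = logger
        self.name = name
        self.url = url
        super().__init__(name)

    def run(self):
        onions = []
        try:
            response = requests.get(self.url, timeout=30)
            for line in response.text.splitlines():
                line = line.strip()
                if line.endswith('.onion'):
                    # Same namedtuple shape the operators expect: (url, source, type).
                    onions.append(self.onion(url=line, source=self.name, type='domain'))
        except requests.exceptions.RequestException as e:
            self.logger.error(e)
        return onions

Registered in the config as module: mylist with a url key, Config.sources() would load it and pass url (plus name) into the constructor, exactly as it does for the gist and simplefile plugins.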
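DbHandlerElasticSearch creates the index with the simple-html nested mapping shown above and indexes one document per processed onion via save(). A short sketch of reading those documents back; the host and index values are assumptions matching the config sketch above, and the query uses the elasticsearch-py 7.x body-style search API.

from elasticsearch import Elasticsearch

es = Elasticsearch([{'host': '127.0.0.1', 'port': 9200}])
results = es.search(index='onions', body={'query': {'match_all': {}}, 'size': 5})
for hit in results['hits']['hits']:
    doc = hit['_source']
    # Documents written by the simple-html operator carry 'hiddenService'
    # plus a 'simple-html' object with HTML, title, language and status.
    html_part = doc.get('simple-html')
    title = html_part.get('title') if isinstance(html_part, dict) else None
    print(doc.get('hiddenService'), title)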