renamed the package and fixed some issues

4 years ago · 8258056bef
parent 23dd01a983
commit 8258056bef
14 changed files with 1431 additions and 0 deletions
--- a/onioningestor/init.py
+++ b/onioningestor/init.py
@ -0,0 +1,131 @@
 import sys
 import time
 import traceback
 import collections
 from . import config
 from . import dbhandler
 from . import loghandler
 class Ingestor:
    """Main work logic of the ingestor.

    Handles reading the config file, calling sources, and handing the onions
    they return to each configured operator.
    """

    def __init__(self, args):
        """Set up logging, configuration, Elasticsearch and plugins.

        :param args: Parsed command-line arguments; ``args.logLevel`` and
            ``args.configFile`` are read.

        Exits the process with status 1 when the Elasticsearch handler or a
        plugin cannot be initialised.
        """
        # Load logger
        log = loghandler.LoggerHandler(args.logLevel)
        self.logger = log.start_logging()

        # Load config
        self.config = config.Config(args.configFile, self.logger)
        self.blacklist = self.config.blacklist()

        # Load Elasticsearch.
        try:
            self.es = dbhandler.DbHandlerElasticSearch(
                self.config.elasticsearch(),
                self.logger)
        except Exception as e:
            # Error loading elasticsearch.
            self.logger.error(e)
            # format_exc() returns the traceback text; print_exc() returns None.
            self.logger.debug(traceback.format_exc())
            sys.exit(1)

        # Instantiate plugins.
        try:
            self.logger.info("Initializing sources")
            self.sources = {name: source(self.logger, **kwargs)
                            for name, source, kwargs in self.config.sources()}

            self.logger.info("initializing operators")
            self.operators = {name: operator(self.logger, self.es, self.blacklist, **kwargs)
                              for name, operator, kwargs in self.config.operators()}

            self.logger.info("initializing notifiers")
            # TODO: notifiers are not instantiated yet.
        except Exception as e:
            # Error loading a source or operator plugin.
            self.logger.error(e)
            self.logger.debug(traceback.format_exc())
            sys.exit(1)

    def run(self):
        """Run once, or forever, depending on config."""
        if self.config.daemon():
            self.logger.info("Running forever, in a loop")
            self.run_forever()
        else:
            self.logger.info("Running once, to completion")
            self.run_once()

    def run_once(self):
        """Run each source once, passing its onions to each operator.

        A failing source or operator is logged and skipped so that the
        remaining plugins still run.
        """
        # Track some statistics about artifacts in a summary object.
        # TODO: nothing updates the summary yet, so it is always empty.
        summary = collections.Counter()

        for source_name, source in self.sources.items():
            # Run the source to collect artifacts.
            self.logger.info(f"Running source '{source_name}'")
            try:
                onions = source.run()
            except Exception as e:
                self.logger.error(e)
                self.logger.error(traceback.format_exc())
                continue

            if not onions:
                # Nothing to hand to the operators for this source.
                self.logger.info('No links found')
                continue
            self.logger.info('Found hidden links')

            # Process artifacts with each operator.
            for operator_name, operator in self.operators.items():
                self.logger.info(f"Processing found onions with operator '{operator_name}'")
                try:
                    doc = operator.process(onions)
                    # Only index when the operator actually produced a document.
                    if doc is not None:
                        self.es.save(doc)
                except Exception as e:
                    self.logger.error(e)
                    self.logger.error(traceback.format_exc())
                    continue

        # Log the summary.
        self.logger.info(f"New artifacts: {dict(summary)}")

    def run_forever(self):
        """Run forever, sleeping for the configured interval between each run."""
        while True:
            self.run_once()
            delay = self.config.sleep()
            self.logger.info(f"Sleeping for {delay} seconds")
            time.sleep(delay)
 def artifact_types(artifact_list):
    """Count the items of ``artifact_list`` by lower-cased class name.

    :param artifact_list: Iterable of arbitrary objects.
    :returns: Dict mapping each lower-cased class name to its number of
        occurrences, in first-seen order.
    """
    counts = {}
    for item in artifact_list:
        kind = item.__class__.__name__.lower()
        counts[kind] = counts.get(kind, 0) + 1
    return counts
--- a/onioningestor/main.py
+++ b/onioningestor/main.py
@ -0,0 +1,49 @@
 """OnionScraper
 A Python3 application for indexing and scraping hidden services ElasticSearch
 Installation:
   This application assumes you have python3 and pip3 installed.
   pip3 install -r requirements.txt
 This software is provided subject to the MIT license stated below.
 --------------------------------------------------
        MIT License
        Permission is hereby granted, free of charge, to any person obtaining a copy
        of this software and associated documentation files (the "Software"), to deal
        in the Software without restriction, including without limitation the rights
        to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
        copies of the Software, and to permit persons to whom the Software is
        furnished to do so, subject to the following conditions:
        The above copyright notice and this permission notice shall be included in all
        copies or substantial portions of the Software.
        THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
        IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
        FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
        AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
        LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
        OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
        SOFTWARE.
 --------------------------------------------------
 """
 import argparse
 from onioningestor import Ingestor
 # Build the command-line interface; the module docstring doubles as the
 # help text, hence the raw formatter that keeps its layout intact.
 parser = argparse.ArgumentParser(
     prog='onionscraper',
     description=__doc__,
     formatter_class=argparse.RawDescriptionHelpFormatter,
 )
 parser.add_argument(
     '-c', '--config',
     dest="configFile",
     required=True,
     help='Path to config file',
 )
 parser.add_argument(
     "--log",
     dest="logLevel",
     default='INFO',
     choices=['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'],
     help="Set the logging level, default is INFO",
 )
 args = parser.parse_args()

 # Hand the parsed options to the ingestor and start it.
 app = Ingestor(args)
 app.run()
--- a/onioningestor/config.py
+++ b/onioningestor/config.py
@ -0,0 +1,172 @@
 import io
 import importlib
 import traceback
 import yaml
 from pathlib import Path
 # Dotted package prefixes under which plugin modules are looked up.
 SOURCE = 'onioningestor.sources'
 OPERATOR = 'onioningestor.operators'

 # Config keys consumed by the loader itself rather than passed to plugins.
 INTERNAL_OPTIONS = ['saved_state', 'module', 'credentials']

 # Names of config keys that receive special handling.
 ARTIFACT_TYPES = 'artifact_types'
 FILTER_STRING = 'filter'
 ALLOWED_SOURCES = 'allowed_sources'
 NAME = 'name'
 class Config:
    """Config read operations, and convenience accessors."""

    def __init__(self, filename, logger):
        """Read and parse the YAML config file.

        :param filename: Path of the YAML file to load.
        :param logger: Logger used for progress and error messages.
        :raises yaml.error.YAMLError: If the file is not valid YAML. The
            error is logged and re-raised so that ``self.config`` is never
            left undefined.
        """
        self.logger = logger
        self.filename = filename
        with io.open(self.filename, 'r') as f:
            try:
                self.logger.info("Loading config file")
                self.config = yaml.safe_load(f.read())
            except yaml.error.YAMLError:
                self.logger.error("YAML error in config")
                raise

    @staticmethod
    def _load_plugin(plugin_type, plugin):
        """Return the ``Plugin`` class of module ``<plugin_type>.<plugin>``.

        :raises Exception: Whatever the import or attribute lookup raised
            (typically ``ImportError`` or ``AttributeError``). The traceback
            is printed first, then the error propagates instead of silently
            returning ``None``.
        """
        try:
            module = importlib.import_module('.'.join([plugin_type, plugin]))
            return module.Plugin
        except Exception:
            traceback.print_exc()
            raise

    def daemon(self):
        """Returns boolean, are we daemonizing?"""
        return self.config['general']['daemon']

    def elasticsearch(self):
        """Returns the Elasticsearch config."""
        return self.config['general']['elasticsearch']

    def sleep(self):
        """Returns number of seconds to sleep between iterations, if daemonizing."""
        return self.config['general']['sleep']

    def blacklist(self):
        """Return the comma-separated blacklist as a list of words.

        Surrounding whitespace is stripped and empty entries are dropped; an
        empty word would otherwise turn into a regex alternative that
        matches every input.
        """
        raw = self.config['general']['blacklist']
        return [word.strip() for word in raw.split(',') if word.strip()]

    def notifiers(self):
        """Returns notifiers config dictionary."""
        return self.config.get('notifiers', {})

    def logging(self):
        """Returns logging config dictionary."""
        return self.config.get('logging', {})

    def credentials(self, credential_name):
        """Return the credentials entry named ``credential_name``.

        :returns: The matching dict from the ``credentials`` section, or an
            empty dict when there is no match or no such section.
        """
        for credential in self.config.get('credentials') or []:
            if NAME in credential and credential[NAME] == credential_name:
                return credential
        return {}

    def _credential_kwargs(self, credential_name):
        """Return the named credentials as plugin kwargs (without the name)."""
        return {key: value
                for key, value in self.credentials(credential_name).items()
                if key != NAME}

    def _resolve_artifact_types(self, names):
        """Map artifact type names to classes via threatingestor's STRING_MAP.

        The original code referenced ``threatingestor.artifacts`` without
        importing it, which raised ``NameError``. The module is now looked up
        lazily; when it is unavailable every name is treated as invalid and
        ignored, the same way unknown names already are.
        """
        try:
            string_map = importlib.import_module('threatingestor.artifacts').STRING_MAP
        except (ImportError, AttributeError):
            self.logger.warning(
                "threatingestor.artifacts is unavailable; ignoring artifact_types")
            return []
        resolved = []
        for name in names:
            try:
                resolved.append(string_map[name.lower().strip()])
            except KeyError:
                # ignore invalid artifact types
                pass
        return resolved

    def sources(self):
        """Return a list of (name, Source class, {kwargs}) tuples.

        :raises Exception: If a source plugin module cannot be loaded.
        """
        sources = []
        for source in self.config['sources']:
            kwargs = {}
            for key, value in source.items():
                if key == 'credentials':
                    # Grab these named credentials
                    kwargs.update(self._credential_kwargs(value))
                elif key not in INTERNAL_OPTIONS:
                    kwargs[key] = value

            # load and initialize the plugin
            self.logger.info(f"Found source '{source[NAME]}'")
            sources.append((source[NAME], self._load_plugin(SOURCE, source['module']), kwargs))

        self.logger.info(f"Found {len(sources)} total sources")
        return sources

    def operators(self):
        """Return a list of (name, Operator class, {kwargs}) tuples.

        :raises Exception: If an operator plugin module cannot be loaded.
        """
        operators = []
        for operator in self.config['operators']:
            kwargs = {}
            for key, value in operator.items():
                if key == 'credentials':
                    # Grab these named credentials
                    kwargs.update(self._credential_kwargs(value))
                elif key in INTERNAL_OPTIONS or key == NAME:
                    # internal options and the name are not operator kwargs
                    continue
                elif key == ARTIFACT_TYPES:
                    # parse out special artifact_types option
                    kwargs[key] = self._resolve_artifact_types(value)
                elif key == FILTER_STRING:
                    # pass in special filter_string option
                    kwargs['filter_string'] = value
                else:
                    kwargs[key] = value

            # load and initialize the plugin
            self.logger.info(f"Found operator '{operator[NAME]}'")
            operators.append((operator[NAME], self._load_plugin(OPERATOR, operator['module']), kwargs))

        self.logger.info(f"Found {len(operators)} total operators")
        return operators
--- a/onioningestor/dbhandler.py
+++ b/onioningestor/dbhandler.py
@ -0,0 +1,75 @@
 import sys
 import traceback
 from elasticsearch import Elasticsearch, helpers
 class DbHandlerElasticSearch:
    """Thin wrapper around an Elasticsearch client bound to a single index."""

    def __init__(self, config, logger):
        """Connect to Elasticsearch and create the index with its mapping.

        :param config: Mapping providing the ``index``, ``host`` and ``port``
            keys.
        :param logger: Logger used for progress and error messages.

        Exits the process with status 1 if the client cannot be created or
        the index creation call fails. A 400 response from index creation is
        ignored via ``ignore=400``.
        """
        self.logger = logger
        self.logger.info('Creating Elasticsearch mapping')
        self.config = config
        # NOTE(review): "HTML" is mapped as "long" and the date field is named
        # "date-indexed", while the simple-html operator stores page text
        # under 'HTML' and a 'date-crawled' timestamp - confirm the mapping.
        self.mapping = '''
        {
          "mappings": {
            "_doc": {
              "properties": {
                "hiddenService": {
                  "type": "text"
                },
                "blacklist": {
                  "type": "keyword"
                },      
                "monitor": {
                  "type": "boolean"
                },
                "simple-html": {
                  "type": "nested",
                  "properties": {
                    "HTML": {
                      "type": "long"
                    },
                    "title": {
                      "type": "text"
                    },
                    "language": {
                      "type": "text"
                    },
                    "status":{
                      "type":"text"
                    },
                    "date-indexed": {
                      "type": "date"
                    }
                  }
                }
              }
            }
          }
        }
        '''
        self.index = self.config['index']
        try:
            self.es = Elasticsearch([{
                'host': self.config['host'],
                'port': self.config['port']}])
            self.es.indices.create(
                index=self.index,
                body=self.mapping,
                ignore=400)
        except Exception as e:
            self.logger.error(e)
            self.logger.error(traceback.format_exc())
            # A failed start-up must not report success to the caller's shell.
            sys.exit(1)

    def count(self):
        """Refresh the index and log how many documents it holds.

        The count is treated as successful when at least one shard answered
        and none failed, so indices with more than one shard are handled too.
        """
        self.es.indices.refresh(index=self.index)
        status = self.es.count(index=self.index)
        shards = status['_shards']
        if shards['successful'] >= 1 and shards.get('failed', 0) == 0:
            self.logger.info('Successful')
            self.logger.info('Count:%d', status['count'])
        else:
            self.logger.error(status)

    def save(self, doc):
        """Index ``doc`` into the configured index, then log the new count."""
        self.es.index(index=self.index, body=doc)
        self.count()
--- a/onioningestor/loghandler.py
+++ b/onioningestor/loghandler.py
@ -0,0 +1,33 @@
 import os
 import logging
 from pathlib import Path
 class LoggerHandler():
    """Configure the shared "OnionScraper" logger with console and file output."""

    def __init__(self, level, log_file=None):
        """Attach a console handler and a file handler to the logger.

        :param level: Name of a ``logging`` level, e.g. ``'INFO'``.
        :param log_file: Optional path of the log file. Defaults to
            ``info.log`` in the parent directory of this module's directory.

        ``logging.getLogger`` returns the same logger object on every call,
        so handlers installed by an earlier ``LoggerHandler`` are removed
        first; otherwise every message would be emitted once per instance.
        """
        self.level = getattr(logging, level)
        self.logger = logging.getLogger("OnionScraper")
        self.logger.setLevel(self.level)

        # Drop only the handlers this class installed previously.
        for handler in list(self.logger.handlers):
            if getattr(handler, '_onionscraper_managed', False):
                self.logger.removeHandler(handler)
                handler.close()

        # create console handler and set its level
        ch = logging.StreamHandler()
        ch.setLevel(self.level)

        # create file logging
        if log_file is None:
            log_file = Path(__file__).parents[1] / "info.log"
        fh = logging.FileHandler(os.fspath(log_file))

        # create formatter
        formatter = logging.Formatter('[%(asctime)s] - %(name)s - %(levelname)s - %(message)s',datefmt='%a, %d %b %Y %H:%M:%S')
        formatter_console = logging.Formatter('[%(asctime)s] - %(levelname)s - %(message)s',datefmt='%d %b %Y %H:%M:%S')

        # add formatter to the handlers
        ch.setFormatter(formatter_console)
        fh.setFormatter(formatter)

        # mark and attach the handlers
        for handler in (ch, fh):
            handler._onionscraper_managed = True
            self.logger.addHandler(handler)

    def start_logging(self):
        """Log the start-up message and return the configured logger."""
        self.logger.info('Starting OnionScraper')
        return self.logger
--- a/onioningestor/operators/init.py
+++ b/onioningestor/operators/init.py
@ -0,0 +1,95 @@
 import re
 import sys
 import json
 class Operator:
    """Base class for all Operator plugins.

    Note: This is an abstract class. You must extend ``__init__`` and call
    ``super`` to ensure this class's constructor is called. You must override
    ``handle_onion`` with the same signature.

    When adding additional methods to child classes, consider prefixing the
    method name with an underscore to denote a ``_private_method``. Do not
    override other existing methods from this class.
    """

    def __init__(self, logger, elasticsearch, allowed_sources=None):
        """Override this constructor in child classes.

        :param logger: Logger used for messages.
        :param elasticsearch: Object with a ``save(doc)`` method; blocked
            responses are stored through it.
        :param allowed_sources: Iterable of blacklist words. Despite the
            name, the words are compiled into the case-insensitive
            ``self.blacklist`` pattern. ``None``, an empty iterable, or empty
            words yield a pattern that never matches (an empty alternative
            would match every input).
        """
        self.logger = logger
        words = [re.escape(word) for word in (allowed_sources or []) if word]
        # '(?!)' is an always-failing lookahead: a pattern that matches nothing.
        self.blacklist = re.compile('|'.join(words) if words else r'(?!)', re.IGNORECASE)
        self.es = elasticsearch

    def response(self, content, onion, operator_name):
        """Wrap ``content`` in the document shape stored for an onion.

        :param content: Result payload. A string is parsed as JSON when
            possible; anything else (e.g. a dict) is used as-is.
        :param onion: Hidden service address (str).
        :param operator_name: Key under which ``content`` is stored.
        :returns: ``{operator_name: content, 'hiddenService': onion}``
        """
        if isinstance(content, str):
            try:
                content = json.loads(content)
            except json.decoder.JSONDecodeError:
                self.logger.info('JosnDecode Error')
        return {operator_name: content, 'hiddenService': onion}

    def handle_onion(self, url):
        """Override with the same signature.

        :param url: Address of a single onion, as passed by ``process``.
        :returns: None (always ignored)
        """
        raise NotImplementedError()

    def _onion_is_allowed(self, response, type='URL'):
        """Return True unless ``response`` matches the blacklist.

        :param response: Document as built by ``response``.
        :param type: ``'URL'`` checks ``response['hiddenService']``;
            ``'HTML'`` checks ``response['simple-html']['HTML']``.
        :returns: False when a blacklisted word matched. In that case the
            response is saved, and for ``'HTML'`` its status is set to
            ``'blocked'`` first. Allowed responses are left unmodified.
        :raises ValueError: For any other ``type``.
        """
        if type == 'URL':
            text = response['hiddenService']
        elif type == 'HTML':
            text = response['simple-html']['HTML']
        else:
            raise ValueError(f"Unsupported blacklist check type: {type!r}")

        if self.blacklist.search(text) is None:
            return True

        if type == 'HTML':
            # Only flag the document once we know it is actually blacklisted.
            response['simple-html']['status'] = 'blocked'
        self.es.save(response)
        return False

    def process(self, onions):
        """Call ``handle_onion`` for every onion whose URL passes the blacklist.

        :param onions: Iterable of objects exposing a ``url`` attribute.
        :returns: None
        """
        for onion in onions:
            if self._onion_is_allowed(
                    self.response({'status':'blocked'},onion.url,'regex-blacklist'),
                    type='URL'):
                self.handle_onion(onion.url)
--- a/onioningestor/operators/html.py
+++ b/onioningestor/operators/html.py
@ -0,0 +1,99 @@
 import time
 import json
 import traceback
 from datetime import datetime as dt
 from json.decoder import JSONDecodeError
 import requests
 from bs4 import BeautifulSoup
 from langdetect import detect
 from stem.control import Controller
 from stem import Signal
 from onioningestor.operators import Operator
 class Plugin(Operator):
    """Simple-html

    This plugin collects HTML code from onion link
    """

    def __init__(self, logger, elasticsearch, allowed_sources, **kwargs):
        """Initialise the plugin from its operator config.

        :param logger: Logger used for messages.
        :param elasticsearch: Handler with a ``save(doc)`` method.
        :param allowed_sources: Blacklist words, forwarded to ``Operator``.
        :param kwargs: Must provide ``timeout``, ``retries``, ``socks5`` (used
            as the ``requests`` proxies setting) and ``TorController`` (a
            mapping with ``port`` and ``password``).
        """
        super(Plugin, self).__init__(logger, elasticsearch, allowed_sources)
        self.plugin_name = 'simple-html'
        self.logger.info(f"Initializing {self.plugin_name}")
        self.timeout = int(kwargs['timeout'])
        self.retries = int(kwargs['retries'])
        self.proxy = kwargs['socks5']
        self.torControl = kwargs['TorController']
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:72.0) Gecko/20100101 Firefox/72.0',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'DNT': '1',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
        }

    def get_tor_session(self):
        """Return a new ``requests`` session using the proxy and headers."""
        session = requests.session()
        session.proxies = self.proxy
        session.headers.update(self.headers)
        return session

    def renew_connection(self):
        """Ask the local Tor controller for a new identity.

        Afterwards the visible IP is logged; a failure of that purely
        informational lookup is logged and does not abort the caller.
        """
        with Controller.from_port(port = self.torControl['port']) as controller:
            # Now we switch TOR identities to make sure we have a good connection
            self.logger.info('Getting new Tor IP')
            # authenticate to our local TOR controller
            controller.authenticate(self.torControl['password'])
            # send the signal for a new identity
            controller.signal(Signal.NEWNYM)
            # wait for the new identity to be initialized
            time.sleep(controller.get_newnym_wait())
            try:
                session = self.get_tor_session()
                reply = session.get('http://httpbin.org/ip', timeout=self.timeout)
                self.logger.info(f"IP is {reply.json()['origin']}")
            except (requests.exceptions.RequestException, ValueError, KeyError) as e:
                self.logger.debug(f'Unable to determine current Tor IP: {e}')

    def _build_index(self, html_text):
        """Build the document stored for a successfully fetched page."""
        from langdetect.lang_detect_exception import LangDetectException

        html = BeautifulSoup(html_text, features="lxml")
        # Pages without a <title> are still worth indexing.
        title = html.title.text if html.title is not None else ''
        try:
            language = detect(html.text)
        except LangDetectException:
            # Raised when the text has no usable features (e.g. it is empty).
            language = None
        return {
            'HTML': html_text,
            'title': title,
            'language': language,
            'date-crawled': dt.utcnow().strftime('%Y-%m-%dT%H:%M:%S.%f') + 'Z',
            'status': 'success',
        }

    def run_sessions(self, onion):
        """Fetch ``http://<onion>`` over Tor, retrying with a new identity.

        :param onion: Hidden service address without scheme.
        :returns: A ``response`` document whose payload has status
            ``'success'`` (with the HTML, title, language and crawl date) or
            ``'failure'`` once more than ``self.retries`` attempts failed.
        """
        url = 'http://' + onion
        retry = 0
        while True:
            try:
                self.logger.info(url)
                # Without a timeout an unresponsive service blocks forever.
                content = self.get_tor_session().get(url, timeout=self.timeout)
                if content.status_code == 200 and content.text:
                    return self.response(
                        self._build_index(content.text), onion, self.plugin_name)
            except requests.exceptions.ConnectionError as connection_error:
                self.logger.error(f'Failed connecting to {url}')
                self.logger.debug(connection_error)
            except Exception as e:
                self.logger.error(e)
                self.logger.debug(traceback.format_exc())

            self.logger.info('[x] No results found retrying ...')
            retry += 1
            if retry > self.retries:
                self.logger.error('[x] Max retries exceeded')
                return self.response({'status':"failure"}, onion, self.plugin_name)
            # Only switch identity when another attempt will follow.
            self.renew_connection()

    def handle_onion(self, onion):
        """Fetch one onion and save the result when it is allowed."""
        content = self.run_sessions(onion)
        if content[self.plugin_name]['status'] == 'success':
            # NOTE(review): this uses the default check type ('URL'), so only
            # the address is re-checked here, not the fetched HTML - confirm
            # whether type='HTML' was intended.
            if self._onion_is_allowed(content):
                self.es.save(content)
--- a/onioningestor/operators/onionscan.py
+++ b/onioningestor/operators/onionscan.py
@ -0,0 +1,264 @@
 import re
 import os
 import sys
 import json
 import time
 import random
 import traceback
 import subprocess
 from uuid import uuid4
 from pathlib import Path
 from datetime import datetime as dt
 from json.decoder import JSONDecodeError
 from concurrent.futures import ProcessPoolExecutor
 from threading import Timer
 import requests
 from stem.control import Controller
 from stem import Signal
 from selenium import webdriver
 from selenium.webdriver.firefox.options import Options
 from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
 from onionscraper.operators import Operator
 class Plugin(Operator):
    """Operator that inspects an onion with the external ``onionscan`` binary.

    For one onion it runs onionscan, fetches the page over Tor, applies the
    blacklist to the address and to the fetched content, takes a screenshot
    and assembles a document from the results.

    NOTE(review): this module imports ``Operator`` from
    ``onionscraper.operators`` while the simple-html plugin imports it from
    ``onioningestor.operators`` - confirm the package name after the rename.
    NOTE(review): ``response`` here takes ``(status, content, onion)``, which
    differs from how ``Operator.process`` calls ``self.response`` - confirm
    how this plugin is meant to be driven.
    """

    # Defaults preserved from the previously hard-coded values; both can be
    # overridden through the ``geckodriver_path`` / ``master_list_path`` kwargs.
    DEFAULT_GECKODRIVER = '/home/tony/Projects/OnionScraper/geckodriver'
    DEFAULT_MASTER_LIST = '/home/tony/Projects/OnionScraper_v2/onion_master_list.txt'
    # Seconds onionscan may run before it is killed.
    ONIONSCAN_TIMEOUT = 300

    def __init__(self, logger, elasticsearch=None, allowed_sources=None, **kwargs):
        """Initialise the plugin from its operator config.

        :param logger: Logger used for messages.
        :param elasticsearch: Optional handler, stored as ``self.es``.
            Accepted so the plugin can be built with the same positional
            arguments as the other operators.
        :param allowed_sources: Optional blacklist words, used only when no
            ``blacklist`` kwarg is given.
        :param kwargs: Must provide ``binpath``, ``timeout``, ``socks5``,
            ``TorController``, ``retries`` and ``interestingKeywords``
            (comma-separated). Optional: ``blacklist`` (comma-separated),
            ``screenshots_path``, ``geckodriver_path``, ``master_list_path``.
        """
        self.logger = logger
        self.es = elasticsearch
        self.logger.info('Initializing OnionScanner')
        screenshots = kwargs.pop('screenshots_path', None)
        if screenshots:
            self.screenshots = Path(screenshots)
        else:
            self.screenshots = Path(__file__).parents[1]/'screenshots'
        self.onionscan = kwargs['binpath']
        self.timeout = int(kwargs['timeout'])
        self.proxy = kwargs['socks5']
        self.torControl = kwargs['TorController']
        self.retries = int(kwargs['retries'])
        self.geckodriver = kwargs.get('geckodriver_path', self.DEFAULT_GECKODRIVER)
        self.master_list = kwargs.get('master_list_path', self.DEFAULT_MASTER_LIST)
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:72.0) Gecko/20100101 Firefox/72.0',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'DNT': '1',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
        }
        if 'blacklist' in kwargs:
            blacklist = kwargs['blacklist'].split(',')
        else:
            blacklist = allowed_sources or []
        self.blacklist = self._compile_words(blacklist)
        self.keywords = self._compile_words(kwargs['interestingKeywords'].split(','))
        # Alias for the attribute name handle_onion historically referred to.
        self.interestingKeywords = self.keywords
        self.session = self.get_tor_session()

    @staticmethod
    def _compile_words(words):
        """Compile words into one case-insensitive alternation pattern.

        Blank words are dropped because an empty alternative matches every
        input; with no words left the pattern never matches.
        """
        escaped = [re.escape(word.strip()) for word in words if word.strip()]
        return re.compile('|'.join(escaped) if escaped else r'(?!)', re.IGNORECASE)

    def response(self, status, content, onion):
        """
        status: success/failure
        content: dict
        onion: str
        return: dict
        """
        return {'status': status, 'data': content, 'onion': onion}

    def parseDoc(self, data):
        """Flatten the onionscan section of ``data`` and record crawled onions.

        ``simpleReport`` is dropped, ``hiddenService`` is moved to the top
        level and ``crawls`` is reduced to the list of its keys. Each key is
        also appended to the master list file.

        :param data: Document with an ``'onionscan'`` dict.
        :returns: The same, modified ``data`` object.
        """
        scan = data['onionscan']
        scan.pop('simpleReport', None)
        # A report without crawls must not break indexing.
        crawls = scan.pop('crawls', None) or {}
        hiddenService = scan.pop('hiddenService', None)
        scan['crawls'] = [*crawls]
        data['hiddenService'] = hiddenService
        if crawls:
            # Open the file once rather than once per onion.
            with open(self.master_list, 'a') as fp:
                fp.writelines("%s\n" % onion for onion in crawls)
        return data

    def format_directory(self, directory):
        """Return ``directory/YYYY/MM/DD`` for today, creating it if needed.

        :param directory: Base directory as a ``pathlib.Path``.
        """
        now = dt.now()
        # month and day are zero-padded to two digits
        save_path = directory / str(now.year) / f"{now.month:02d}" / f"{now.day:02d}"
        if not os.path.isdir(save_path):
            self.logger.info("[*] Creating directory to save screenshots")
            os.makedirs(save_path, exist_ok=True)
        return save_path

    def take_screenshot(self, save_path, onion):
        """Screenshot ``http://<onion>`` with headless Firefox.

        :param save_path: Directory in which the PNG is written.
        :param onion: Hidden service address without scheme.
        :returns: ``response`` dict; on success ``data`` holds the filename
            and the screenshot timestamp.
        """
        # NOTE(review): the SOCKS proxy is hard-coded here while self.proxy
        # is configurable - confirm they are meant to agree.
        fp = webdriver.FirefoxProfile()
        fp.set_preference('network.proxy.type', 1)
        fp.set_preference('network.proxy.socks', '127.0.0.1')
        fp.set_preference('network.proxy.socks_port', 9050)
        fp.set_preference('network.proxy.socks_remote_dns', True)
        options = Options()
        options.headless = True
        driver = webdriver.Firefox(
                executable_path=self.geckodriver,
                options=options,
                firefox_profile=fp)
        uid = str(uuid4()).split('-')[0]
        filename = f"{onion}_screenshot_{uid}.png"
        f_name = f"{save_path}/{filename}"
        try:
            driver.get('http://' + onion)
            driver.save_screenshot(f_name)
        finally:
            # Always release the browser, even when loading the page fails.
            driver.quit()

        if os.path.isfile(f_name):
            self.logger.info(f'[*] Screenshot was taken. {f_name}')
            dateScreenshoted = dt.utcnow().strftime('%Y-%m-%dT%H:%M:%S.%f')+ 'Z'
            result = {'dateScreenshoted':dateScreenshoted,'filename':filename}
            return self.response("success",result,onion)
        else:
            self.logger.error('[x] Unable to take screenshot')
            return self.response("failure",None,onion)

    def get_tor_session(self):
        """Return a new ``requests`` session using the proxy and headers."""
        session = requests.session()
        session.proxies = self.proxy
        session.headers.update(self.headers)
        return session

    # signal TOR for a new connection
    def renew_connection(self):
        """Ask the local Tor controller for a new identity.

        Afterwards the visible IP is logged; a failure of that purely
        informational lookup is logged and does not abort the caller.
        """
        with Controller.from_port(port = self.torControl['port']) as controller:
            # Now we switch TOR identities to make sure we have a good connection
            self.logger.info('Getting new Tor IP')
            # authenticate to our local TOR controller
            controller.authenticate(self.torControl['password'])
            # send the signal for a new identity
            controller.signal(Signal.NEWNYM)
            # wait for the new identity to be initialized
            time.sleep(controller.get_newnym_wait())
            try:
                session = self.get_tor_session()
                reply = session.get('http://httpbin.org/ip', timeout=self.timeout)
                self.logger.info(f"IP is {reply.json()['origin']}")
            except (requests.exceptions.RequestException, ValueError, KeyError) as e:
                self.logger.debug(f'Unable to determine current Tor IP: {e}')

    def handle_timeout(self, process, onion):
        """Kill a timed-out onionscan process and switch Tor identity."""
        try:
            # kill the onionscan process
            process.kill()
            self.logger.info("[!!!] Killed the onionscan process.")
        except Exception as e:
            # Best effort: the process may already be gone.
            self.logger.debug(f'Unable to kill the onionscan process: {e}')
        self.renew_connection()
        return

    def run_sessions(self, onion):
        """Fetch ``http://<onion>`` over Tor, retrying with a new identity.

        :returns: ``response`` dict; on success ``data`` is the parsed JSON
            body, or the raw text when the body is not JSON. ``'failure'``
            once more than ``self.retries`` attempts produced no result.
        """
        url = 'http://' + onion
        retry = 0
        while True:
            result = None
            try:
                self.logger.info(url)
                # Without a timeout an unresponsive service blocks forever.
                content = self.session.get(url, timeout=self.timeout)
                if content.status_code == 200:
                    try:
                        result = content.json()
                    except ValueError as e:
                        # ValueError is the common base of the JSON decode
                        # errors raised by the json module and by requests.
                        self.logger.debug(f'JSONDecodeError {e}')
                        result = content.text
            except Exception as e:
                self.logger.error(e)
                self.logger.debug(traceback.format_exc())

            if result:
                return self.response("success",result,onion)

            self.logger.info('[x] No results found retrying ...')
            retry += 1
            if retry > self.retries:
                self.logger.error('[x] Max retries exceeded')
                return self.response("failure",None, onion)
            # Only switch identity when another attempt will follow.
            self.renew_connection()

    def run_onionscan(self, onion):
        """Run onionscan on ``onion`` and return its parsed JSON report.

        :returns: ``response`` dict; ``data`` is the parsed report on
            success and ``None`` when the scan timed out or printed no valid
            JSON.
        """
        self.logger.info("[*] Running onionscan on %s", onion)

        # fire up onionscan
        process = subprocess.Popen([self.onionscan,"--webport=0","--jsonReport","--simpleReport=false",onion],stdout=subprocess.PIPE,stderr=subprocess.PIPE)

        try:
            # wait for the onion scan results, but not forever
            stdout = process.communicate(timeout=self.ONIONSCAN_TIMEOUT)[0]
        except subprocess.TimeoutExpired:
            try:
                self.handle_timeout(process, onion)
            finally:
                # Reap the killed process and drain its pipes.
                process.communicate()
            self.logger.info("[!!!] Process timed out for %s", onion)
            return self.response("failure",None, onion)

        try:
            return self.response("success",json.loads(stdout),onion)
        except json.decoder.JSONDecodeError:
            self.logger.error("[x] onionscan returned no valid JSON for %s", onion)
            return self.response("failure",None, onion)

    def handle_onion(self, onion_tuple):
        """Scan, fetch, filter and screenshot one onion.

        :param onion_tuple: Object with a ``url`` attribute, or the address
            itself as a string.
        :returns: The document produced by ``parseDoc``, or ``None`` when
            the onion was blocked, inactive, unreachable, or an error was
            logged.
        """
        onion = getattr(onion_tuple, 'url', onion_tuple)
        self.logger.info(f'Processing {onion} with onionscan')
        try:
            blacklist_URL = self.blacklist.search(onion)
            if blacklist_URL:
                self.logger.info(f"[X] Blocked by blacklist => matched keyword {blacklist_URL.group()}")
                return None
            self.logger.debug("[*] URL blacklist test: PASSED")

            results = self.run_onionscan(onion)
            if results['status'] != 'success':
                self.logger.info(f"[x] hidden service {onion} is not active")
                return None

            content = self.run_sessions(onion)
            if content['status'] != 'success':
                return None

            page = content['data']
            # run_sessions may return parsed JSON; the regexes need text.
            text = page if isinstance(page, str) else json.dumps(page)
            blacklist_CONTENT = self.blacklist.search(text)
            if blacklist_CONTENT:
                self.logger.info(f"[X] Blocked by blacklist content => matched keyword {blacklist_CONTENT.group()}")
                return None
            self.logger.debug("[*] CONTENT blacklist test: PASSED")

            screenshot = self.take_screenshot(self.format_directory(self.screenshots), onion)
            self.logger.info("Indexing!")
            doc = {
                # run_onionscan already returns the parsed report.
                'onionscan': results['data'],
                'html': page,
                'screenshots': screenshot['data'],
                'interestingKeywords': self.keywords.findall(text),
            }
            return self.parseDoc(doc)
        except Exception as e:
            self.logger.error(e)
            self.logger.error(traceback.format_exc())
            return None
--- a/onioningestor/operators/yara.py
+++ b/onioningestor/operators/yara.py
@ -0,0 +1,15 @@
 from onionscraper.operators import Operator
 class Plugin(Operator):
    """Placeholder operator plugin; it currently performs no work.

    NOTE(review): the previous summary described CSV output, but nothing in
    this class writes a file -- confirm the intended behaviour.
    """
    def __init__(self, filename, base_score):
        """Remember ``filename``; ``base_score`` is accepted but not used."""
        # The base-class initializer is deliberately not invoked here.
        self.filename = filename
    def handle_artifact(self, artifact):
        """Receive a single artifact and do nothing with it."""
        return None
--- a/onioningestor/sources/init.py
+++ b/onioningestor/sources/init.py
@ -0,0 +1,41 @@
 from collections import namedtuple
 class Source(object):
    """Base class for all Source plugins.

    Note: This is an abstract class. You must override ``__init__`` and ``run``
    in child classes. You should not override ``process_element``. When adding
    additional methods to child classes, consider prefixing the method name
    with an underscore to denote a ``_private_method``.
    """
    def __init__(self, name, *args, **kwargs):
        """Create the ``onion`` record type used to report results.

        Override this constructor in child classes and call it from there.
        The first argument must always be ``name``; it is accepted for
        interface compatibility only and is not stored by this base class.
        Other arguments should be url, auth, etc., whatever is needed to set
        up the object.
        """
        # Record type for results: (url, source, type).
        self.onion = namedtuple('onion', ['url', 'source', 'type'])
    def run(self):
        """Collect onion records from the source.

        Override this method in child classes; the base implementation only
        signals that no implementation exists.

        :raises NotImplementedError: always, in this base class.
        """
        raise NotImplementedError()
    def process_element(self, content, reference_link, include_nonobfuscated=False):
        """Log that an element is being processed by this source.

        No extraction is performed here yet; the parameters are accepted but
        unused and the method returns ``None``.

        :param content: String content to extract from.
        :param reference_link: Reference link to attach to all artifacts.
        :param include_nonobfuscated: Include non-defanged URLs in output?
        """
        # Subclasses normally set ``self.logger`` and ``self.name``; fall back
        # gracefully so this never fails on a missing attribute or global.
        logger = getattr(self, 'logger', None)
        if logger is None:
            import logging
            logger = logging.getLogger(__name__)
        source_name = getattr(self, 'name', type(self).__name__)
        logger.debug(f"Processing in source '{source_name}'")
--- a/onioningestor/sources/gist.py
+++ b/onioningestor/sources/gist.py
@ -0,0 +1,153 @@
 #!/usr/bin/python3
 # -*- coding: utf-8 -*-
 __author__ = 'Andrey Glauzer'
 __license__ = "MIT"
 __version__ = "1.0.1"
 __maintainer__ = "Andrey Glauzer"
 __status__ = "Development"
 import requests
 import json
 import re
 import re
 import urllib.parse
 from random import choice
 import time
 from bs4 import BeautifulSoup
 from onionscraper.sources import Source
 class Plugin(Source):
    """Source plugin that collects .onion addresses from public GitHub gists.

    ``run`` drives the whole pipeline: open a session, build the list of
    search-result pages, collect raw-file links from gists that mention
    ``.onion``, then pull onion hostnames out of the ``.txt``/``.csv`` files.
    """
    # Highest search-result page to visit. The original paging loop could
    # never execute, so 1 preserves that single-page behaviour; override it in
    # a subclass to crawl further result pages.
    max_pages = 1
    # Matches an onion hostname at the very start of a cleaned line.
    _onion_regex = re.compile(r"[A-Za-z0-9]{0,12}\.?[A-Za-z0-9]{12,50}\.onion")
    def __init__(self, logger, name, url):
        """Store the logger, the plugin name and the first listing URL."""
        self.logger = logger
        self.name = name
        self.url = url
        self.desktop_agents = [
                'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:60.0) Gecko/20100101 Firefox/60.0']
        # The base class expects the plugin name, not the instance itself.
        super().__init__(name)
    def run(self):
        """Execute the scrape and return a list of ``onion`` records."""
        self.logger.info('Starting Gist Scraper')
        self.cookies()
        try:
            self.pagination()
            self.scraping()
            return self.raw()
        finally:
            # Close the session only after every step has used it.
            self.session.close()
    @property
    def random_headers(self):
        """Request headers with a user agent picked from ``desktop_agents``."""
        return {
            'User-Agent': choice(self.desktop_agents),
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
        }
    def cookies(self):
        """Open the shared session and request ``self.url`` once."""
        self.logger.info('Setting GIST cookies')
        # Not a ``with`` block: the session must outlive this method because
        # the later steps keep using it; ``run`` closes it at the end.
        self.session = requests.Session()
        self.headers = self.random_headers
        request = self.session.get(self.url, headers=self.headers)
        if request.status_code != 200:
            self.logger.error('No Response from GIST')
    def pagination(self):
        """Build ``self.urls``, the listing pages that ``scraping`` visits."""
        query = urllib.parse.quote('.onio')
        request = self.session.get(
            f"https://gist.github.com/search?l=Text&q={query}", headers=self.headers)
        self.soup = BeautifulSoup(request.content, features="lxml")
        self.urls = [self.url]
        pager = self.soup.find('div', {'class': 'pagination'})
        if pager is None:
            # No pager in the response: only the first page is visited.
            return
        labels = [anchor.get_text() for anchor in pager.findAll('a')]
        try:
            # Presumably the second-to-last pager link holds the highest page
            # number (taken from the original disabled bound) -- verify.
            last_page = int(labels[-2])
        except (IndexError, ValueError):
            return
        for page in range(2, min(last_page, self.max_pages) + 1):
            self.urls.append(
                f"https://gist.github.com/search?l=Text&p={page}&q={query}")
    def scraping(self):
        """Fill ``self.urls_raw`` with raw-file URLs of gists mentioning .onion."""
        gist_pages = []
        for inurl in self.urls:
            self.logger.info(f"Connecting to {inurl}")
            time.sleep(5)
            soup = self._fetch_soup(inurl)
            if soup is None:
                continue
            for code in soup.findAll('div', {'class': 'gist-snippet'}):
                if '.onion' not in code.get_text().lower():
                    continue
                for raw in code.findAll('a', {'class': 'link-overlay'}):
                    href = raw.get('href')
                    if href is not None:
                        gist_pages.append(href)
        # Visit each gist once, after all listing pages have been read.
        self.urls_raw = []
        for get in gist_pages:
            self.logger.info(f"Connecting to {get}")
            time.sleep(5)
            soup = self._fetch_soup(get)
            if soup is None:
                continue
            for raw in soup.findAll('a', {'class': 'btn btn-sm'}):
                href = raw.get('href')
                if href is not None:
                    self.urls_raw.append(f"https://gist.githubusercontent.com{href}")
    def raw(self):
        """Download the ``.txt``/``.csv`` raw files and return ``onion`` records."""
        self.logger.info('Performing replaces and regex. WAIT...')
        onions = []
        for raw in self.urls_raw:
            lowered = raw.lower()
            if '.txt' not in lowered and '.csv' not in lowered:
                continue
            time.sleep(5)
            soup = self._fetch_soup(raw)
            if soup is None:
                continue
            self.soup = soup
            # Scan only this file's lines; rescanning lines from earlier files
            # would report the same addresses again.
            for pre in soup.findAll('body'):
                for line in pre.get_text().split('\n'):
                    rurls = line \
                        .replace('\xad', '') \
                        .replace('\n', '') \
                        .replace("http://", '') \
                        .replace("https://", '') \
                        .replace("www.", "")
                    url = self._onion_regex.match(rurls)
                    if url is not None:
                        onions.append(self.onion(url=url.group(), source='gist', type='domain'))
        return onions
    def _fetch_soup(self, url):
        """Fetch ``url`` with the shared session; return parsed HTML or ``None``.

        ``None`` is returned for a non-200 response or when one of the listed
        request errors occurs (the error is logged).
        """
        try:
            request = self.session.get(url, headers=self.headers)
        except (requests.exceptions.ConnectionError,
                requests.exceptions.ChunkedEncodingError,
                requests.exceptions.ReadTimeout,
                requests.exceptions.InvalidURL) as e:
            self.logger.error(
                f"I was unable to connect to the url, because an error occurred.\n{e}")
            return None
        if request.status_code != 200:
            return None
        return BeautifulSoup(request.content, features="lxml")
--- a/onioningestor/sources/gmail.py
+++ b/onioningestor/sources/gmail.py
@ -0,0 +1,153 @@
 #!/usr/bin/python3
 # -*- coding: utf-8 -*-
 __author__ = 'Daneiele Perera'
 __license__ = "MIT"
 __version__ = "1.0.1"
 __maintainer__ = "Daniele Perera"
 __status__ = "Development"
 import requests
 import json
 import re
 import re
 import urllib.parse
 from random import choice
 import time
 from bs4 import BeautifulSoup
 from onionscraper.sources import Source
 class Plugin(Source):
    """Source plugin that collects .onion addresses from public GitHub gists.

    NOTE(review): this module is named ``gmail`` but every request targets
    gist.github.com and results are labelled ``'gist'`` -- presumably a
    placeholder copy of the gist plugin; confirm before relying on it.

    ``run`` drives the whole pipeline: open a session, build the list of
    search-result pages, collect raw-file links from gists that mention
    ``.onion``, then pull onion hostnames out of the ``.txt``/``.csv`` files.
    """
    # Highest search-result page to visit. The original paging loop could
    # never execute, so 1 preserves that single-page behaviour; override it in
    # a subclass to crawl further result pages.
    max_pages = 1
    # Matches an onion hostname at the very start of a cleaned line.
    _onion_regex = re.compile(r"[A-Za-z0-9]{0,12}\.?[A-Za-z0-9]{12,50}\.onion")
    def __init__(self, logger, name, url):
        """Store the logger, the plugin name and the first listing URL."""
        self.logger = logger
        self.name = name
        self.url = url
        self.desktop_agents = [
                'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:60.0) Gecko/20100101 Firefox/60.0']
        # The base class expects the plugin name, not the instance itself.
        super().__init__(name)
    def run(self):
        """Execute the scrape and return a list of ``onion`` records."""
        self.logger.info('Starting Gist Scraper')
        self.cookies()
        try:
            self.pagination()
            self.scraping()
            return self.raw()
        finally:
            # Close the session only after every step has used it.
            self.session.close()
    @property
    def random_headers(self):
        """Request headers with a user agent picked from ``desktop_agents``."""
        return {
            'User-Agent': choice(self.desktop_agents),
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
        }
    def cookies(self):
        """Open the shared session and request ``self.url`` once."""
        self.logger.info('Setting GIST cookies')
        # Not a ``with`` block: the session must outlive this method because
        # the later steps keep using it; ``run`` closes it at the end.
        self.session = requests.Session()
        self.headers = self.random_headers
        request = self.session.get(self.url, headers=self.headers)
        if request.status_code != 200:
            self.logger.error('No Response from GIST')
    def pagination(self):
        """Build ``self.urls``, the listing pages that ``scraping`` visits."""
        query = urllib.parse.quote('.onio')
        request = self.session.get(
            f"https://gist.github.com/search?l=Text&q={query}", headers=self.headers)
        self.soup = BeautifulSoup(request.content, features="lxml")
        self.urls = [self.url]
        pager = self.soup.find('div', {'class': 'pagination'})
        if pager is None:
            # No pager in the response: only the first page is visited.
            return
        labels = [anchor.get_text() for anchor in pager.findAll('a')]
        try:
            # Presumably the second-to-last pager link holds the highest page
            # number (taken from the original disabled bound) -- verify.
            last_page = int(labels[-2])
        except (IndexError, ValueError):
            return
        for page in range(2, min(last_page, self.max_pages) + 1):
            self.urls.append(
                f"https://gist.github.com/search?l=Text&p={page}&q={query}")
    def scraping(self):
        """Fill ``self.urls_raw`` with raw-file URLs of gists mentioning .onion."""
        gist_pages = []
        for inurl in self.urls:
            self.logger.info(f"Connecting to {inurl}")
            time.sleep(5)
            soup = self._fetch_soup(inurl)
            if soup is None:
                continue
            for code in soup.findAll('div', {'class': 'gist-snippet'}):
                if '.onion' not in code.get_text().lower():
                    continue
                for raw in code.findAll('a', {'class': 'link-overlay'}):
                    href = raw.get('href')
                    if href is not None:
                        gist_pages.append(href)
        # Visit each gist once, after all listing pages have been read.
        self.urls_raw = []
        for get in gist_pages:
            self.logger.info(f"Connecting to {get}")
            time.sleep(5)
            soup = self._fetch_soup(get)
            if soup is None:
                continue
            for raw in soup.findAll('a', {'class': 'btn btn-sm'}):
                href = raw.get('href')
                if href is not None:
                    self.urls_raw.append(f"https://gist.githubusercontent.com{href}")
    def raw(self):
        """Download the ``.txt``/``.csv`` raw files and return ``onion`` records."""
        self.logger.info('Performing replaces and regex. WAIT...')
        onions = []
        for raw in self.urls_raw:
            lowered = raw.lower()
            if '.txt' not in lowered and '.csv' not in lowered:
                continue
            time.sleep(5)
            soup = self._fetch_soup(raw)
            if soup is None:
                continue
            self.soup = soup
            # Scan only this file's lines; rescanning lines from earlier files
            # would report the same addresses again.
            for pre in soup.findAll('body'):
                for line in pre.get_text().split('\n'):
                    rurls = line \
                        .replace('\xad', '') \
                        .replace('\n', '') \
                        .replace("http://", '') \
                        .replace("https://", '') \
                        .replace("www.", "")
                    url = self._onion_regex.match(rurls)
                    if url is not None:
                        onions.append(self.onion(url=url.group(), source='gist', type='domain'))
        return onions
    def _fetch_soup(self, url):
        """Fetch ``url`` with the shared session; return parsed HTML or ``None``.

        ``None`` is returned for a non-200 response or when one of the listed
        request errors occurs (the error is logged).
        """
        try:
            request = self.session.get(url, headers=self.headers)
        except (requests.exceptions.ConnectionError,
                requests.exceptions.ChunkedEncodingError,
                requests.exceptions.ReadTimeout,
                requests.exceptions.InvalidURL) as e:
            self.logger.error(
                f"I was unable to connect to the url, because an error occurred.\n{e}")
            return None
        if request.status_code != 200:
            return None
        return BeautifulSoup(request.content, features="lxml")
--- a/onioningestor/sources/reddit.py
+++ b/onioningestor/sources/reddit.py
@ -0,0 +1,120 @@
 #!/usr/bin/python3
 # -*- coding: utf-8 -*-
 __author__ = 'Andrey Glauzer'
 __license__ = "MIT"
 __version__ = "1.0.1"
 __maintainer__ = "Andrey Glauzer"
 __status__ = "Development"
 import requests
 import json
 import re
 import logging
 import re
 import urllib.parse
 from random import choice
 from bs4 import BeautifulSoup
 class Reddit:
    """Scraper that collects .onion addresses linked from r/onions comments.

    It queries the pushshift comment search, opens each comment permalink on
    reddit.com, follows the ``https://`` links marked ``rel="nofollow"`` and
    extracts onion hostnames from the text of the linked pages.
    """
    # Matches an onion hostname at the very start of a cleaned line.
    _ONION_RE = re.compile(r"[A-Za-z0-9]{0,12}\.?[A-Za-z0-9]{12,50}\.onion")
    def __init__(self):
        """Open the HTTP session and set the API endpoint and user agents."""
        self.session = requests.session()
        self.source = 'Reddit'
        self.url = 'https://api.pushshift.io/reddit/search/comment/?subreddit=onions&limit=1000000'
        self.desktop_agents = [
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:60.0) Gecko/20100101 Firefox/60.0']
    @property
    def random_headers(self):
        """Request headers with a user agent picked from ``desktop_agents``."""
        return {
            'User-Agent': choice(self.desktop_agents),
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
        }
    @property
    def start(self):
        """Run the scrape and return the list of onion addresses found.

        Kept as a property because existing callers trigger the scrape by
        plain attribute access.
        """
        return self.reddit_json()
    def reddit_json(self):
        """Query the comment API and return every onion address discovered."""
        print('Getting Reddit API information')
        onionurl = []
        try:
            request = self.session.get(self.url,  headers=self.random_headers)
            loaded_json = json.loads(request.content)
            print(
                'Filtering the URLs that have the word .onion in the text')
            for data in loaded_json['data']:
                reddit_url = 'https://www.reddit.com{}'.format(
                    data['permalink'])
                onionurl.extend(self._onions_from_comment(reddit_url))
        except(requests.exceptions.ConnectionError,
               requests.exceptions.ChunkedEncodingError,
               requests.exceptions.ReadTimeout,
               requests.exceptions.InvalidURL) as e:
            print(
                'Não consegui conectar na url, porque ocorreu um erro.\n{e}'.format(e=e))
        except (ValueError, KeyError, TypeError) as e:
            # Body was not JSON, or lacked the expected 'data'/'permalink' keys.
            print('Unexpected response from the Reddit API: {e!r}'.format(e=e))
        return onionurl
    def raw(self, url):
        """Return the visible text of ``url``, or ``None`` if unavailable.

        ``None`` is returned when ``url`` is ``None``, the response status is
        not 200, or one of the listed request errors occurs.
        """
        try:
            if url is not None:
                request = self.session.get(url, headers=self.random_headers)
                print(
                    'Connecting in {url} - {status}'.format(url=url, status=request.status_code))
                if request.status_code == 200:
                    soup = BeautifulSoup(request.content, features="lxml")
                    for s in soup(['script', 'style']):
                        s.decompose()
                    return ' '.join(soup.stripped_strings)
        except (requests.exceptions.ConnectionError,
                requests.exceptions.ChunkedEncodingError,
                requests.exceptions.ReadTimeout,
                requests.exceptions.TooManyRedirects) as e:
            # Still best-effort, but report the failure like the other methods.
            print(
                'Não consegui conectar na url, porque ocorreu um erro.\n{e}'.format(e=e))
        return None
    def _onions_from_comment(self, reddit_url):
        """Return onion addresses found behind the links of one comment page."""
        found = []
        try:
            request = self.session.get(
                reddit_url,  headers=self.random_headers)
            soup = BeautifulSoup(request.content, features="lxml")
            for raw in soup.findAll('a', {'rel': 'nofollow'}):
                href = raw.get('href')
                # Anchors without an href are skipped rather than raising.
                if href is None or 'https://' not in href:
                    continue
                raw_text = self.raw(url=href)
                if raw_text is not None:
                    print(
                        'Applying REGEX. Wait...')
                    found.extend(self._extract_onions(raw_text))
        except(requests.exceptions.ConnectionError,
               requests.exceptions.ChunkedEncodingError,
               requests.exceptions.ReadTimeout,
               requests.exceptions.InvalidURL) as e:
            print(
                'Não consegui conectar na url, porque ocorreu um erro.\n{e}'.format(e=e))
        return found
    def _extract_onions(self, text):
        """Return the onion hostname at the start of each line of ``text``."""
        onions = []
        for lines in text.split('\n'):
            # NOTE(review): replace(r'\s', '') removes the literal two
            # characters backslash-s, not whitespace. Left unchanged: removing
            # real whitespace would glue words onto the hostname.
            rurls = lines \
                .replace('\xad', '') \
                .replace('\n', '') \
                .replace("http://", '') \
                .replace("https://", '') \
                .replace(r'\s', '') \
                .replace('\t', '')
            xurl = self._ONION_RE.match(rurls)
            if xurl is not None:
                onions.append(xurl.group())
        return onions
 if __name__ == '__main__':
    # Running this module directly performs one scrape: reading the ``start``
    # property is what triggers it (the value is not used here).
    scraper = Reddit()
    scraper.start
--- a/onioningestor/sources/simplefile.py
+++ b/onioningestor/sources/simplefile.py
@ -0,0 +1,31 @@
 #!/usr/bin/python3
 # -*- coding: utf-8 -*-
 __author__ = 'Andrey Glauzer'
 __license__ = "MIT"
 __version__ = "1.0.1"
 __maintainer__ = "Andrey Glauzer"
 __status__ = "Development"
 import requests
 from pathlib import Path
 from onioningestor.sources import Source
 class Plugin(Source):
    """Source plugin that reads onion addresses from a text file, one per line."""
    def __init__(self, logger, name, filename):
        """Store the logger, the plugin name and the file to read.

        :param logger: Logger kept on the instance.
        :param name: Name of this source.
        :param filename: Path of the input file, joined onto
            ``Path(__file__).parents[2]`` when ``run`` is called.
        """
        self.logger = logger
        self.name = name
        self.filename = filename
        # The base class expects the plugin name, not the instance itself.
        super().__init__(name)
    def run(self):
        """Yield one ``onion`` record per non-blank line of the file.

        Surrounding whitespace is stripped and blank lines are skipped, so an
        empty address is never reported.
        """
        filepath = Path(__file__).parents[2]/self.filename
        # Read everything first so the file is closed before records are
        # consumed by the caller.
        with open(filepath, 'r') as fp:
            lines = fp.read().splitlines()
        for line in lines:
            onion = line.strip()
            if not onion:
                continue
            yield self.onion(url=onion,source='simple-file',type='domain')