renamed the package and fixed some issues
parent 23dd01a983
commit 8258056bef
@ -0,0 +1,131 @@
import sys
import time
import traceback
import collections

from . import config
from . import dbhandler
from . import loghandler


class Ingestor:
    """ThreatIngestor main work logic.

    Handles reading the config file, calling sources, maintaining state, and
    sending artifacts to operators.
    """
    def __init__(self, args):
        # Load logger.
        log = loghandler.LoggerHandler(args.logLevel)
        self.logger = log.start_logging()

        # Load config.
        self.config = config.Config(args.configFile, self.logger)
        self.blacklist = self.config.blacklist()

        # Load Elasticsearch.
        try:
            self.es = dbhandler.DbHandlerElasticSearch(
                self.config.elasticsearch(),
                self.logger)
        except Exception as e:
            # Error loading Elasticsearch.
            self.logger.error(e)
            self.logger.debug(traceback.format_exc())
            sys.exit(1)

        # Instantiate plugins.
        try:
            self.logger.info("Initializing sources")
            self.sources = {name: source(self.logger, **kwargs)
                            for name, source, kwargs in self.config.sources()}

            self.logger.info("Initializing operators")
            self.operators = {name: operator(self.logger, self.es, self.blacklist, **kwargs)
                              for name, operator, kwargs in self.config.operators()}

            self.logger.info("Initializing notifiers")
            #self.notifiers = {name: operator(**kwargs)
            #                  for name, operator, kwargs in self.config.notifiers()}
        except Exception as e:
            # Error loading plugins.
            self.logger.error(e)
            self.logger.debug(traceback.format_exc())
            sys.exit(1)

    def run(self):
        """Run once, or forever, depending on config."""
        if self.config.daemon():
            self.logger.info("Running forever, in a loop")
            self.run_forever()
        else:
            self.logger.info("Running once, to completion")
            self.run_once()

    def run_once(self):
        """Run each source once, passing artifacts to each operator."""
        # Track some statistics about artifacts in a summary object.
        summary = collections.Counter()

        for source in self.sources:
            # Run the source to collect artifacts.
            self.logger.info(f"Running source '{source}'")
            try:
                onions = self.sources[source].run()
                if onions:
                    self.logger.info('Found hidden service links')
                else:
                    self.logger.info('No links found')
            except Exception as e:
                self.logger.error(e)
                self.logger.error(traceback.format_exc())
                continue

            # Process artifacts with each operator.
            for operator in self.operators:
                self.logger.info(f"Processing found onions with operator '{operator}'")
                try:
                    doc = self.operators[operator].process(onions)
                    # Save the source state.
                    self.es.save(doc)
                except Exception as e:
                    self.logger.error(e)
                    self.logger.error(traceback.format_exc())
                    continue

            # # Record stats and update the summary.
            # types = artifact_types(doc.get('interestingKeywords'))
            # summary.update(types)
            # for artifact_type in types:
            #     self.logger.info(f'types[artifact_type]')

        # Log the summary.
        self.logger.info(f"New artifacts: {dict(summary)}")

    def run_forever(self):
        """Run forever, sleeping for the configured interval between each run."""
        while True:
            self.run_once()

            self.logger.info(f"Sleeping for {self.config.sleep()} seconds")
            time.sleep(self.config.sleep())


def artifact_types(artifact_list):
    """Return a dictionary with counts of each artifact type."""
    types = {}
    for artifact in artifact_list:
        artifact_type = artifact.__class__.__name__.lower()
        if artifact_type in types:
            types[artifact_type] += 1
        else:
            types[artifact_type] = 1

    return types
@ -0,0 +1,49 @@
"""OnionScraper

A Python 3 application for scraping hidden services and indexing them into Elasticsearch.

Installation:
This application assumes you have python3 and pip3 installed.

    pip3 install -r requirements.txt


This software is provided subject to the MIT license stated below.
--------------------------------------------------
MIT License

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------
"""
import argparse

from onioningestor import Ingestor


# Load arguments from user.
parser = argparse.ArgumentParser(
    prog='onionscraper',
    description=__doc__,
    formatter_class=argparse.RawDescriptionHelpFormatter)
parser.add_argument('-c', '--config', dest="configFile", required=True,
                    help='Path to config file')
parser.add_argument("--log", dest="logLevel", default='INFO',
                    choices=['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'],
                    help="Set the logging level, default is INFO")

args = parser.parse_args()
app = Ingestor(args)

app.run()
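# Example invocation (illustrative sketch; it assumes this script is saved as
# onionscraper.py and that a config.yml exists next to it; both names are
# assumptions, not part of this commit):
#
#   python3 onionscraper.py -c config.yml --log DEBUG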
@ -0,0 +1,172 @@
import io
import importlib
import traceback

import yaml

from pathlib import Path


SOURCE = 'onioningestor.sources'
OPERATOR = 'onioningestor.operators'

INTERNAL_OPTIONS = [
    'saved_state',
    'module',
    'credentials',
]

ARTIFACT_TYPES = 'artifact_types'
FILTER_STRING = 'filter'
ALLOWED_SOURCES = 'allowed_sources'
NAME = 'name'


class Config:
    """Config read/write operations, and convenience methods."""
    def __init__(self, filename, logger):
        """Read a config file."""
        self.logger = logger
        self.filename = filename
        with io.open(self.filename, 'r') as f:
            try:
                self.logger.info("Loading config file")
                self.config = yaml.safe_load(f.read())
            except yaml.error.YAMLError:
                self.logger.error("YAML error in config")

    @staticmethod
    def _load_plugin(plugin_type, plugin):
        """Returns plugin class or raises an exception.

        :raises: threatingestor.exceptions.PluginError
        """
        try:
            module = importlib.import_module('.'.join([plugin_type, plugin]))
            return module.Plugin
        except Exception as e:
            print(e)
            traceback.print_exc()

    def daemon(self):
        """Returns boolean, are we daemonizing?"""
        return self.config['general']['daemon']

    def elasticsearch(self):
        """Returns Elasticsearch config."""
        return self.config['general']['elasticsearch']

    def sleep(self):
        """Returns number of seconds to sleep between iterations, if daemonizing."""
        return self.config['general']['sleep']

    def blacklist(self):
        """Returns the list of blacklisted keywords."""
        return self.config['general']['blacklist'].split(',')

    # def onionscanner(self):
    #     """Returns onionscanner config dict"""
    #     screenshots = self.config['onionscanner'].pop('screenshots_path', None)
    #     if screenshots:
    #         self.config['onionscanner']['screenshots_path'] = Path(screenshots)
    #     else:
    #         self.config['onionscanner']['screenshots_path'] = Path(__file__).parents[1]/'screenshots'
    #     blacklist = self.config['onionscanner'].pop('blacklist', None)
    #     if blacklist:
    #         self.config['onionscanner']['blacklist'] = blacklist.split(',')
    #     interestingKeywords = self.config['onionscanner'].pop('interestingKeywords', None)
    #     if interestingKeywords:
    #         self.config['onionscanner']['interestingKeywords'] = blacklist.split(',')
    #     return self.config['onionscanner']

    def notifiers(self):
        """Returns notifiers config dictionary."""
        return self.config.get('notifiers', {})

    def logging(self):
        """Returns logging config dictionary."""
        return self.config.get('logging', {})

    def credentials(self, credential_name):
        """Return a dictionary with the specified credentials."""
        for credential in self.config['credentials']:
            for key, value in credential.items():
                if key == NAME and value == credential_name:
                    return credential
        return {}

    def sources(self):
        """Return a list of (name, Source class, {kwargs}) tuples.

        :raises: threatingestor.exceptions.PluginError
        """
        sources = []

        for source in self.config['sources']:
            kwargs = {}
            for key, value in source.items():
                if key not in INTERNAL_OPTIONS:
                    kwargs[key] = value

                elif key == 'credentials':
                    # Grab these named credentials.
                    credential_name = value
                    for credential_key, credential_value in self.credentials(credential_name).items():
                        if credential_key != NAME:
                            kwargs[credential_key] = credential_value

            # Load and initialize the plugin.
            self.logger.info(f"Found source '{source[NAME]}'")
            sources.append((source[NAME], self._load_plugin(SOURCE, source['module']), kwargs))

        self.logger.info(f"Found {len(sources)} total sources")
        return sources

    def operators(self):
        """Return a list of (name, Operator class, {kwargs}) tuples.

        :raises: threatingestor.exceptions.PluginError
        """
        operators = []
        for operator in self.config['operators']:
            kwargs = {}
            for key, value in operator.items():
                if key not in INTERNAL_OPTIONS:
                    if key == ARTIFACT_TYPES:
                        # Parse out the special artifact_types option.
                        artifact_types = []
                        for artifact in value:
                            try:
                                artifact_types.append(threatingestor.artifacts.STRING_MAP[artifact.lower().strip()])
                            except KeyError:
                                # Ignore invalid artifact types.
                                pass
                        kwargs[key] = artifact_types

                    elif key == FILTER_STRING:
                        # Pass in the special filter_string option.
                        kwargs['filter_string'] = value

                    elif key == NAME:
                        # Exclude the name key from operator kwargs, since it's not used.
                        pass

                    else:
                        kwargs[key] = value

                elif key == 'credentials':
                    # Grab these named credentials.
                    credential_name = value
                    for credential_key, credential_value in self.credentials(credential_name).items():
                        if credential_key != NAME:
                            kwargs[credential_key] = credential_value

            # Load and initialize the plugin.
            self.logger.info(f"Found operator '{operator[NAME]}'")
            operators.append((operator[NAME], self._load_plugin(OPERATOR, operator['module']), kwargs))

        self.logger.info(f"Found {len(operators)} total operators")
        return operators
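# Illustrative config.yml sketch matching the keys this class reads. Everything
# below is an assumption for documentation purposes (module names, hosts and
# values are placeholders, not part of this commit):
#
# general:
#   daemon: False
#   sleep: 60
#   blacklist: keyword1,keyword2
#   elasticsearch:
#     index: onions
#     host: 127.0.0.1
#     port: 9200
#
# sources:
#   - name: gist
#     module: gist
#     url: https://gist.github.com/search?l=Text&q=.onion
#
# operators:
#   - name: simple-html
#     module: html
#     timeout: 30
#     retries: 2
#     socks5:
#       http: socks5h://127.0.0.1:9050
#       https: socks5h://127.0.0.1:9050
#     TorController:
#       port: 9051
#       password: torpassword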
@ -0,0 +1,75 @@
import sys
import traceback

from elasticsearch import Elasticsearch, helpers


class DbHandlerElasticSearch:
    def __init__(self, config, logger):
        self.logger = logger
        self.logger.info('Creating Elasticsearch mapping')
        self.config = config
        self.mapping = '''
        {
            "mappings": {
                "_doc": {
                    "properties": {
                        "hiddenService": {
                            "type": "text"
                        },
                        "blacklist": {
                            "type": "keyword"
                        },
                        "monitor": {
                            "type": "boolean"
                        },
                        "simple-html": {
                            "type": "nested",
                            "properties": {
                                "HTML": {
                                    "type": "long"
                                },
                                "title": {
                                    "type": "text"
                                },
                                "language": {
                                    "type": "text"
                                },
                                "status": {
                                    "type": "text"
                                },
                                "date-indexed": {
                                    "type": "date"
                                }
                            }
                        }
                    }
                }
            }
        }
        '''
        self.index = self.config['index']
        try:
            self.es = Elasticsearch([{
                'host': self.config['host'],
                'port': self.config['port']}])
            self.es.indices.create(
                index=self.index,
                body=self.mapping,
                ignore=400)
        except Exception as e:
            self.logger.error(e)
            self.logger.error(traceback.format_exc())
            sys.exit(1)

    def count(self):
        self.es.indices.refresh(self.index)
        status = self.es.count(index=self.index)
        if status['_shards']['successful'] == 1:
            self.logger.info('Successful')
            self.logger.info('Count:%d', status['count'])
        else:
            self.logger.error(status)

    def save(self, doc):
        self.es.index(index=self.index, body=doc)
        self.count()
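# Shape of a document passed to save() by the simple-html operator, as built in
# its run_sessions()/response() methods (illustrative example; values are
# placeholders):
#
# {
#     "simple-html": {
#         "HTML": "<html>...</html>",
#         "title": "Example hidden service",
#         "language": "en",
#         "date-crawled": "2020-01-01T00:00:00.000000Z",
#         "status": "success"
#     },
#     "hiddenService": "example.onion"
# }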
@ -0,0 +1,33 @@
import os
import logging
from pathlib import Path


class LoggerHandler():
    def __init__(self, level):
        self.level = getattr(logging, level)
        self.logger = logging.getLogger("OnionScraper")
        self.logger.setLevel(self.level)

        # Create console handler and set level.
        ch = logging.StreamHandler()
        ch.setLevel(self.level)

        # Create file logging.
        logFile = Path(__file__).parents[1]
        logging_path = os.path.join(logFile, "info.log")
        fh = logging.FileHandler(logging_path)

        # Create formatters.
        formatter = logging.Formatter('[%(asctime)s] - %(name)s - %(levelname)s - %(message)s', datefmt='%a, %d %b %Y %H:%M:%S')
        formatter_console = logging.Formatter('[%(asctime)s] - %(levelname)s - %(message)s', datefmt='%d %b %Y %H:%M:%S')
        # Add formatters to the handlers.
        ch.setFormatter(formatter_console)
        fh.setFormatter(formatter)
        # Add the handlers to the logger.
        self.logger.addHandler(ch)  # console logging
        self.logger.addHandler(fh)  # file logging

    def start_logging(self):
        self.logger.info('Starting OnionScraper')
        return self.logger
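# Usage sketch (illustrative): build the handler once and hand the logger to
# the rest of the application.
#
#   log = LoggerHandler('DEBUG')
#   logger = log.start_logging()
#   logger.info('hello from OnionScraper')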
@ -0,0 +1,95 @@
import re
import sys
import json


class Operator:
    """Base class for all Operator plugins.

    Note: This is an abstract class. You must extend ``__init__`` and call
    ``super`` to ensure this class's constructor is called. You must override
    ``handle_onion`` with the same signature. You may define additional
    ``handle_{artifact_type}`` methods as needed (see the threatkb operator for
    an example) - these methods are purely convention, and are not required.

    When adding additional methods to child classes, consider prefixing the
    method name with an underscore to denote a ``_private_method``. Do not
    override other existing methods from this class.
    """
    def __init__(self, logger, elasticsearch, allowed_sources=None):
        """Override this constructor in child classes.

        The arguments above (logger, elasticsearch, allowed_sources) should be
        accepted explicitly as above, in all child classes.

        Additional arguments should be added: url, auth, etc, whatever is
        needed to set up the object.

        Each operator should default self.artifact_types to a list of Artifacts
        supported by the plugin, and allow passing in artifact_types to
        overwrite that default.

        Example:

        >>> self.artifact_types = artifact_types or [
        ...     artifacts.IPAddress,
        ...     artifacts.Domain,
        ... ]

        It's recommended to call this __init__ method via super from all child
        classes. Remember to do so *before* setting any default artifact_types.
        """
        self.logger = logger
        self.blacklist = re.compile('|'.join([re.escape(word) for word in allowed_sources]), re.IGNORECASE)
        self.es = elasticsearch

    def response(self, content, onion, operator_name):
        """
        content: dict
        onion: str
        return: dict
        """
        try:
            return {operator_name: json.loads(str(content)), 'hiddenService': onion}
        except json.decoder.JSONDecodeError as e:
            self.logger.info('JSONDecodeError')
            return {operator_name: content, 'hiddenService': onion}
        #except TypeError:
        #    return {operator_name: None, 'hiddenService': onion}
        except Exception as e:
            self.logger.error(e)

    def handle_onion(self, url):
        """Override with the same signature.

        :param url: A single onion URL.
        :returns: None (always ignored)
        """
        raise NotImplementedError()

    def _onion_is_allowed(self, response, type='URL'):
        """Returns True if this is allowed by this plugin's filters."""
        # Must not match the blacklist, if set.
        if type == 'URL':
            print(response)
            blacklist = self.blacklist.findall(response['hiddenService'])
        elif type == 'HTML':
            response['simple-html'].pop('status')
            response['simple-html']['status'] = 'blocked'
            blacklist = self.blacklist.findall(response['simple-html']['HTML'])
        if blacklist:
            self.es.save(response)
            return False
        return True

    def process(self, onions):
        """Process all applicable onions."""
        for onion in onions:
            if self._onion_is_allowed(
                    self.response({'status': 'blocked'}, onion.url, 'regex-blacklist'),
                    type='URL'):
                self.handle_onion(onion.url)
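# Minimal sketch of a child operator plugin following the contract above
# (illustrative only; the plugin name and behaviour are assumptions, not part
# of this commit):
#
# from onioningestor.operators import Operator
#
# class Plugin(Operator):
#     def __init__(self, logger, elasticsearch, allowed_sources, **kwargs):
#         super(Plugin, self).__init__(logger, elasticsearch, allowed_sources)
#         self.plugin_name = 'example'
#
#     def handle_onion(self, url):
#         # Fetch or inspect the hidden service, then index the result.
#         doc = self.response({'status': 'success'}, url, self.plugin_name)
#         if self._onion_is_allowed(doc):
#             self.es.save(doc)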
@ -0,0 +1,99 @@
import time
import json
import traceback
from datetime import datetime as dt
from json.decoder import JSONDecodeError

import requests

from bs4 import BeautifulSoup

from langdetect import detect

from stem.control import Controller
from stem import Signal

from onioningestor.operators import Operator


class Plugin(Operator):
    """Simple-html

    This plugin collects the HTML code from an onion link.
    """

    def __init__(self, logger, elasticsearch, allowed_sources, **kwargs):
        super(Plugin, self).__init__(logger, elasticsearch, allowed_sources)
        self.plugin_name = 'simple-html'
        self.logger.info(f"Initializing {self.plugin_name}")

        self.timeout = int(kwargs['timeout'])
        self.retries = int(kwargs['retries'])

        self.proxy = kwargs['socks5']
        self.torControl = kwargs['TorController']
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:72.0) Gecko/20100101 Firefox/72.0',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'DNT': '1',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1'}

    def get_tor_session(self):
        try:
            s = requests.session()
            s.proxies = self.proxy
            s.headers.update(self.headers)
        except Exception as e:
            self.logger.error(e)
            self.logger.debug(traceback.format_exc())
        return s

    def renew_connection(self):
        with Controller.from_port(port=self.torControl['port']) as controller:
            # Now we switch Tor identities to make sure we have a good connection.
            self.logger.info('Getting new Tor IP')
            # Authenticate to our local Tor controller.
            controller.authenticate(self.torControl['password'])
            # Send the signal for a new identity.
            controller.signal(Signal.NEWNYM)
            # Wait for the new identity to be initialized.
            time.sleep(controller.get_newnym_wait())
            session = self.get_tor_session()
            self.logger.info(f"IP is {session.get('http://httpbin.org/ip').json()['origin']}")

    def run_sessions(self, onion):
        retry = 0
        result = None
        while True:
            try:
                url = 'http://' + onion
                self.logger.info(url)
                content = self.get_tor_session().get(url)
                if content.status_code == 200:
                    result = content.text
                if result:
                    html = BeautifulSoup(result, features="lxml")
                    index = {
                        'HTML': result,
                        'title': html.title.text,
                        'language': detect(html.text),
                        'date-crawled': dt.utcnow().strftime('%Y-%m-%dT%H:%M:%S.%f') + 'Z',
                        'status': 'success'}
                    return self.response(index, onion, self.plugin_name)
            except requests.exceptions.ConnectionError as connection_error:
                self.logger.error(f'Failed connecting to {url}')
                self.logger.debug(connection_error)
            except Exception as e:
                self.logger.error(e)
                self.logger.debug(traceback.format_exc())

            self.logger.info('[x] No results found retrying ...')
            retry += 1
            self.renew_connection()
            if retry > self.retries:
                self.logger.error('[x] Max retries exceeded')
                return self.response({'status': 'failure'}, onion, self.plugin_name)

    def handle_onion(self, onion):
        content = self.run_sessions(onion)
        print(content)
        if content[self.plugin_name]['status'] == 'success':
            if self._onion_is_allowed(content):
                self.es.save(content)
@ -0,0 +1,264 @@
import re
import os
import sys
import json
import time
import random
import traceback
import subprocess
from uuid import uuid4
from pathlib import Path
from datetime import datetime as dt
from json.decoder import JSONDecodeError
from concurrent.futures import ProcessPoolExecutor
from threading import Timer

import requests

from stem.control import Controller
from stem import Signal

from selenium import webdriver
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary

from onionscraper.operators import Operator


class Plugin(Operator):
    """OnionScanner operator.

    Runs the onionscan binary against a hidden service, fetches its content
    over Tor, takes a screenshot, and builds a document for indexing.
    """
    def __init__(self, logger, **kwargs):
        self.logger = logger
        self.logger.info('Initializing OnionScanner')
        screenshots = kwargs.pop('screenshots_path', None)
        if screenshots:
            self.screenshots = Path(screenshots)
        else:
            self.screenshots = Path(__file__).parents[1]/'screenshots'
        self.onionscan = kwargs['binpath']
        self.timeout = int(kwargs['timeout'])
        self.proxy = kwargs['socks5']
        self.torControl = kwargs['TorController']
        self.retries = int(kwargs['retries'])
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:72.0) Gecko/20100101 Firefox/72.0',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'DNT': '1',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1'}

        blacklist = kwargs['blacklist'].split(',')
        self.blacklist = re.compile('|'.join([re.escape(word) for word in blacklist]), re.IGNORECASE)
        keywords = kwargs['interestingKeywords'].split(',')
        self.interestingKeywords = re.compile('|'.join([re.escape(word) for word in keywords]), re.IGNORECASE)
        self.session = self.get_tor_session()

    def response(self, status, content, onion):
        """
        status: success/failure
        content: dict
        onion: str
        return: dict
        """
        return {'status': status, 'data': content, 'onion': onion}

    def parseDoc(self, data):
        data['onionscan'].pop('simpleReport', None)
        crawls = data['onionscan'].pop('crawls', None)
        hiddenService = data['onionscan'].pop('hiddenService', None)
        data['onionscan']['crawls'] = [*crawls]
        data['hiddenService'] = hiddenService
        for onion in crawls.keys():
            print(onion)
            with open('/home/tony/Projects/OnionScraper_v2/onion_master_list.txt', 'a') as fp:
                fp.write("%s\n" % onion)
            #q.enqueue(self.crawl, onion)
        #with open('test.json', 'w', encoding='utf-8') as f:
        #    json.dump(data, f, ensure_ascii=False, indent=4)
        return data

    def format_directory(self, directory):
        d = dt.now()
        year = str(d.year)
        month = str(d.month)
        # Prefix month and day with "0" if it is only one digit.
        if len(month) < 2:
            month = "0" + month
        day = str(d.day)
        if len(day) < 2:
            day = "0" + day
        save_path = directory/year/month/day
        if not os.path.isdir(save_path):
            self.logger.info("[*] Creating directory to save screenshots")
            os.makedirs(save_path)

        return save_path

    def take_screenshot(self, save_path, onion):
        binary = FirefoxBinary('/home/tony/Projects/OnionScraper/geckodriver')
        fp = webdriver.FirefoxProfile()
        fp.set_preference('network.proxy.type', 1)
        fp.set_preference('network.proxy.socks', '127.0.0.1')
        fp.set_preference('network.proxy.socks_port', 9050)
        fp.set_preference('network.proxy.socks_remote_dns', True)

        options = Options()
        options.headless = True
        driver = webdriver.Firefox(
            executable_path='/home/tony/Projects/OnionScraper/geckodriver',
            options=options,
            firefox_profile=fp)
        url = 'http://' + onion
        driver.get(url)
        uid = str(uuid4()).split('-')[0]
        filename = f"{onion}_screenshot_{uid}.png"
        f_name = f"{save_path}/{filename}"
        driver.save_screenshot(f_name)

        driver.quit()

        if os.path.isfile(f_name):
            self.logger.info(f'[*] Screenshot was taken. {f_name}')
            dateScreenshoted = dt.utcnow().strftime('%Y-%m-%dT%H:%M:%S.%f') + 'Z'
            result = {'dateScreenshoted': dateScreenshoted, 'filename': filename}
            return self.response("success", result, onion)
        else:
            self.logger.error('[x] Unable to take screenshot')
            return self.response("failure", None, onion)

    def get_tor_session(self):
        try:
            s = requests.session()
            s.proxies = self.proxy
            s.headers.update(self.headers)
        except Exception as e:
            self.logger.error(e)
            self.logger.debug(traceback.format_exc())
        return s

    # Signal Tor for a new connection.
    def renew_connection(self):
        with Controller.from_port(port=self.torControl['port']) as controller:
            # Now we switch Tor identities to make sure we have a good connection.
            self.logger.info('Getting new Tor IP')
            # Authenticate to our local Tor controller.
            controller.authenticate(self.torControl['password'])
            # Send the signal for a new identity.
            controller.signal(Signal.NEWNYM)
            # Wait for the new identity to be initialized.
            time.sleep(controller.get_newnym_wait())
            session = self.get_tor_session()
            self.logger.info(f"IP is {session.get('http://httpbin.org/ip').json()['origin']}")

    def handle_timeout(self, process, onion):
        #
        # Handle a timeout from the onionscan process.
        #
        try:
            # Kill the onionscan process.
            process.kill()
            self.logger.info("[!!!] Killed the onionscan process.")
        except:
            pass
        self.renew_connection()
        return

    def run_sessions(self, onion):
        retry = 0
        result = None
        while True:
            try:
                url = 'http://' + onion
                self.logger.info(url)
                content = self.session.get(url)
                if content.status_code == 200:
                    result = content.json()
            except JSONDecodeError as e:
                self.logger.debug(f'JSONDecodeError {e}')
                result = content.text
            except Exception as e:
                self.logger.error(e)
                self.logger.debug(traceback.format_exc())
            finally:
                if result:
                    return self.response("success", result, onion)
                else:
                    self.logger.info('[x] No results found retrying ...')
                    retry += 1
                    self.renew_connection()
                    if retry > self.retries:
                        self.logger.error('[x] Max retries exceeded')
                        return self.response("failure", None, onion)

    def run_onionscan(self, onion):
        self.logger.info("[*] Running onionscan on %s", onion)

        # Fire up onionscan.
        process = subprocess.Popen([self.onionscan, "--webport=0", "--jsonReport", "--simpleReport=false", onion], stdout=subprocess.PIPE, stderr=subprocess.PIPE)

        # Start the timer and let it run till timeout minutes.
        process_timer = Timer(300, self.handle_timeout, args=[process, onion])
        process_timer.start()

        # Wait for the onion scan results.
        stdout = process.communicate()[0]

        # We have received valid results so we can kill the timer.
        if process_timer.is_alive():
            process_timer.cancel()
            try:
                return self.response("success", json.loads(stdout), onion)
            except json.decoder.JSONDecodeError:
                pass

        self.logger.info("[!!!] Process timed out for %s", onion)

        return self.response("failure", None, onion)

    def handle_onion(self, onion_tuple):
        onion = onion_tuple.url
        self.logger.info(f'Processing {onion} with onionscan')
        try:
            blacklist_URL = self.blacklist.search(onion)
            if blacklist_URL:
                self.logger.info(f"[X] Blocked by blacklist => matched keyword {blacklist_URL.group()}")
            else:
                self.logger.debug("[*] URL blacklist test: PASSED")
                results = self.run_onionscan(onion)
                if results['status'] == 'success':  # and results['data']['webDetected'] == 'true':
                    content = self.run_sessions(onion)
                    if content['status'] == 'success':
                        blacklist_CONTENT = self.blacklist.search(content['data'])
                        if blacklist_CONTENT:
                            self.logger.info(f"[X] Blocked by blacklist content => matched keyword {blacklist_CONTENT.group()}")
                        else:
                            self.logger.debug("[*] CONTENT blacklist test: PASSED")
                            screenshot = self.take_screenshot(self.format_directory(self.screenshots), onion)
                            self.logger.info("Indexing!")
                            doc = {
                                'onionscan': results['data'],
                                'html': content['data'],
                                'screenshots': screenshot['data'],
                                'interestingKeywords': self.interestingKeywords.findall(content['data'])
                            }
                            return self.parseDoc(doc)

                else:
                    self.logger.info(f"[x] hidden service {onion} is not active")
        except Exception as e:
            self.logger.error(e)
            self.logger.error(traceback.format_exc())
        finally:
            pass
            #sys.exit(0)
@ -0,0 +1,15 @@
from onionscraper.operators import Operator


class Plugin(Operator):
    """Operator for output to flat CSV file."""
    def __init__(self, filename, base_score):
        """CSV operator."""
        self.filename = filename

        #super(Plugin, self).__init__(artifact_types, filter_string, allowed_sources)

    def handle_artifact(self, artifact):
        """Operate on a single artifact."""
        pass
@ -0,0 +1,41 @@
from collections import namedtuple


class Source(object):
    """Base class for all Source plugins.

    Note: This is an abstract class. You must override ``__init__`` and ``run``
    in child classes. You should not override ``process_element``. When adding
    additional methods to child classes, consider prefixing the method name
    with an underscore to denote a ``_private_method``.
    """
    def __init__(self, name, *args, **kwargs):
        """Override this constructor in child classes.

        The first argument must always be ``name``.
        Other arguments should be url, auth, etc, whatever is needed to set
        up the object.
        """
        self.onion = namedtuple('onion', ['url', 'source', 'type'])

    def run(self):
        """Run and return ``(saved_state, list(Artifact))``.

        Override this method in child classes.
        The method signature and return values must remain consistent.

        The method should attempt to pick up where we left off using
        ``saved_state``, if supported. If ``saved_state`` is ``None``, you can
        assume this is a first run. If state is maintained by the remote
        resource (e.g. as it is with SQS), ``saved_state`` should always be
        ``None``.
        """
        raise NotImplementedError()

    def process_element(self, content, reference_link, include_nonobfuscated=False):
        """Take a single source content/url and return a list of Artifacts.

        This is the main work block of Source plugins, which handles
        IOC extraction and artifact creation.

        :param content: String content to extract from.
        :param reference_link: Reference link to attach to all artifacts.
        :param include_nonobfuscated: Include non-defanged URLs in output?
        """
        self.logger.debug(f"Processing in source '{self.name}'")
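# Minimal sketch of a child source plugin following the contract above
# (illustrative only; the example URL and return value are assumptions, not
# part of this commit):
#
# from onioningestor.sources import Source
#
# class Plugin(Source):
#     def __init__(self, logger, name, url):
#         self.logger = logger
#         self.name = name
#         self.url = url
#         super().__init__(self)
#
#     def run(self):
#         # Return (or yield) onion namedtuples for the ingestor to process.
#         return [self.onion(url='example.onion', source=self.name, type='domain')]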
@ -0,0 +1,153 @@
#!/usr/bin/python3
# -*- coding: utf-8 -*-

__author__ = 'Andrey Glauzer'
__license__ = "MIT"
__version__ = "1.0.1"
__maintainer__ = "Andrey Glauzer"
__status__ = "Development"

import requests
import json
import re
import urllib.parse
from random import choice
import time
from bs4 import BeautifulSoup


from onionscraper.sources import Source


class Plugin(Source):

    def __init__(self, logger, name, url):
        self.logger = logger
        self.name = name
        self.url = url
        self.desktop_agents = [
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:60.0) Gecko/20100101 Firefox/60.0']
        super().__init__(self)

    def run(self):
        self.logger.info('Starting Gist Scraper')
        self.cookies()
        self.pagination()
        self.scraping()
        return self.raw()

    @property
    def random_headers(self):
        return {
            'User-Agent': choice(self.desktop_agents),
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
        }

    def cookies(self):
        self.logger.info('Setting GIST cookies')

        with requests.Session() as self.session:
            self.headers = self.random_headers

            request = self.session.get(self.url, headers=self.headers)

            if request.status_code == 200:
                pass
            else:
                self.logger.error('No Response from GIST')

    def pagination(self):
        request = self.session.get(
            f"https://gist.github.com/search?l=Text&q={urllib.parse.quote('.onio')}", headers=self.headers)
        self.soup = BeautifulSoup(request.content, features="lxml")

        pages = []
        self.urls = [self.url]
        try:
            for pagination in self.soup.find('div', {'class': 'pagination'}).findAll('a'):
                pages.append(pagination.get_text())
        except:
            pages = False

        if pages:
            cont = 2
            while cont <= 1:  # int(pages[-2]):
                cont += 1
                full_url = f"https://gist.github.com/search?l=Text&p={cont-1}&q={urllib.parse.quote('.onio')}"
                self.urls.append(full_url)

    def scraping(self):
        url = []
        for inurl in self.urls:
            self.logger.info(f"Connecting to {inurl}")
            time.sleep(5)
            request = self.session.get(inurl, headers=self.headers)

            if request.status_code == 200:
                soup = BeautifulSoup(request.content, features="lxml")
                for code in soup.findAll('div', {'class': 'gist-snippet'}):
                    if '.onion' in code.get_text().lower():
                        for raw in code.findAll('a', {'class': 'link-overlay'}):
                            try:
                                url.append(raw['href'])
                            except:
                                pass

        self.urls_raw = []
        for get in url:
            self.logger.info(f"Connecting to {get}")
            time.sleep(5)
            try:
                request = self.session.get(get, headers=self.headers)

                if request.status_code == 200:
                    soup = BeautifulSoup(request.content, features="lxml")

                    for raw in soup.findAll('a', {'class': 'btn btn-sm'}):
                        try:
                            gist_url = f"https://gist.githubusercontent.com{raw['href']}"

                            self.urls_raw.append(gist_url)

                        except:
                            pass
            except(requests.exceptions.ConnectionError,
                   requests.exceptions.ChunkedEncodingError,
                   requests.exceptions.ReadTimeout,
                   requests.exceptions.InvalidURL) as e:
                self.logger.error(
                    f"Unable to connect to the URL because an error occurred.\n{e}")
                pass

    def raw(self):
        self.logger.info('Performing replaces and regex. WAIT...')
        itens = []
        onions = []
        for raw in self.urls_raw:
            if '.txt' in raw.lower() \
                    or '.csv' in raw.lower():
                time.sleep(5)
                request = self.session.get(raw, headers=self.headers)
                self.soup = BeautifulSoup(request.content, features="lxml")
                for pre in self.soup.findAll('body'):
                    list = pre.get_text().split('\n')
                    itens.extend(list)

        regex = re.compile(
            r"[A-Za-z0-9]{0,12}\.?[A-Za-z0-9]{12,50}\.onion")

        for lines in itens:
            rurls = lines \
                .replace('\xad', '') \
                .replace('\n', '') \
                .replace("http://", '') \
                .replace("https://", '') \
                .replace("www.", "")

            url = regex.match(rurls)

            if url is not None:
                onions.append(self.onion(url=url.group(), source='gist', type='domain'))
        return onions
@ -0,0 +1,153 @@
#!/usr/bin/python3
# -*- coding: utf-8 -*-

__author__ = 'Daniele Perera'
__license__ = "MIT"
__version__ = "1.0.1"
__maintainer__ = "Daniele Perera"
__status__ = "Development"

import requests
import json
import re
import urllib.parse
from random import choice
import time
from bs4 import BeautifulSoup


from onionscraper.sources import Source


class Plugin(Source):

    def __init__(self, logger, name, url):
        self.logger = logger
        self.name = name
        self.url = url
        self.desktop_agents = [
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:60.0) Gecko/20100101 Firefox/60.0']
        super().__init__(self)

    def run(self):
        self.logger.info('Starting Gist Scraper')
        self.cookies()
        self.pagination()
        self.scraping()
        return self.raw()

    @property
    def random_headers(self):
        return {
            'User-Agent': choice(self.desktop_agents),
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
        }

    def cookies(self):
        self.logger.info('Setting GIST cookies')

        with requests.Session() as self.session:
            self.headers = self.random_headers

            request = self.session.get(self.url, headers=self.headers)

            if request.status_code == 200:
                pass
            else:
                self.logger.error('No Response from GIST')

    def pagination(self):
        request = self.session.get(
            f"https://gist.github.com/search?l=Text&q={urllib.parse.quote('.onio')}", headers=self.headers)
        self.soup = BeautifulSoup(request.content, features="lxml")

        pages = []
        self.urls = [self.url]
        try:
            for pagination in self.soup.find('div', {'class': 'pagination'}).findAll('a'):
                pages.append(pagination.get_text())
        except:
            pages = False

        if pages:
            cont = 2
            while cont <= 1:  # int(pages[-2]):
                cont += 1
                full_url = f"https://gist.github.com/search?l=Text&p={cont-1}&q={urllib.parse.quote('.onio')}"
                self.urls.append(full_url)

    def scraping(self):
        url = []
        for inurl in self.urls:
            self.logger.info(f"Connecting to {inurl}")
            time.sleep(5)
            request = self.session.get(inurl, headers=self.headers)

            if request.status_code == 200:
                soup = BeautifulSoup(request.content, features="lxml")
                for code in soup.findAll('div', {'class': 'gist-snippet'}):
                    if '.onion' in code.get_text().lower():
                        for raw in code.findAll('a', {'class': 'link-overlay'}):
                            try:
                                url.append(raw['href'])
                            except:
                                pass

        self.urls_raw = []
        for get in url:
            self.logger.info(f"Connecting to {get}")
            time.sleep(5)
            try:
                request = self.session.get(get, headers=self.headers)

                if request.status_code == 200:
                    soup = BeautifulSoup(request.content, features="lxml")

                    for raw in soup.findAll('a', {'class': 'btn btn-sm'}):
                        try:
                            gist_url = f"https://gist.githubusercontent.com{raw['href']}"

                            self.urls_raw.append(gist_url)

                        except:
                            pass
            except(requests.exceptions.ConnectionError,
                   requests.exceptions.ChunkedEncodingError,
                   requests.exceptions.ReadTimeout,
                   requests.exceptions.InvalidURL) as e:
                self.logger.error(
                    f"Unable to connect to the URL because an error occurred.\n{e}")
                pass

    def raw(self):
        self.logger.info('Performing replaces and regex. WAIT...')
        itens = []
        onions = []
        for raw in self.urls_raw:
            if '.txt' in raw.lower() \
                    or '.csv' in raw.lower():
                time.sleep(5)
                request = self.session.get(raw, headers=self.headers)
                self.soup = BeautifulSoup(request.content, features="lxml")
                for pre in self.soup.findAll('body'):
                    list = pre.get_text().split('\n')
                    itens.extend(list)

        regex = re.compile(
            r"[A-Za-z0-9]{0,12}\.?[A-Za-z0-9]{12,50}\.onion")

        for lines in itens:
            rurls = lines \
                .replace('\xad', '') \
                .replace('\n', '') \
                .replace("http://", '') \
                .replace("https://", '') \
                .replace("www.", "")

            url = regex.match(rurls)

            if url is not None:
                onions.append(self.onion(url=url.group(), source='gist', type='domain'))
        return onions
@ -0,0 +1,120 @@
#!/usr/bin/python3
# -*- coding: utf-8 -*-

__author__ = 'Andrey Glauzer'
__license__ = "MIT"
__version__ = "1.0.1"
__maintainer__ = "Andrey Glauzer"
__status__ = "Development"

import requests
import json
import re
import logging
import urllib.parse
from random import choice
from bs4 import BeautifulSoup


class Reddit:
    def __init__(self):
        self.session = requests.session()

        self.source = 'Reddit'

        self.url = 'https://api.pushshift.io/reddit/search/comment/?subreddit=onions&limit=1000000'
        self.desktop_agents = [
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:60.0) Gecko/20100101 Firefox/60.0']

    @property
    def random_headers(self):
        return {
            'User-Agent': choice(self.desktop_agents),
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
        }

    @property
    def start(self):
        self.reddit_json()

    def reddit_json(self):
        print('Getting Reddit API information')
        onionurl = []
        try:
            request = self.session.get(self.url, headers=self.random_headers)

            loaded_json = json.loads(request.content)

            print('Filtering the URLs that have the word .onion in the text')
            for data in loaded_json['data']:
                reddit_url = 'https://www.reddit.com{}'.format(data['permalink'])
                try:
                    request = self.session.get(
                        reddit_url, headers=self.random_headers)
                    soup = BeautifulSoup(request.content, features="lxml")

                    for raw in soup.findAll('a', {'rel': 'nofollow'}):
                        if 'https://' in raw['href']:
                            raw_text = self.raw(url=raw['href'])
                            if raw_text is not None:
                                print('Applying REGEX. Wait...')
                                regex = re.compile(
                                    r"[A-Za-z0-9]{0,12}\.?[A-Za-z0-9]{12,50}\.onion")

                                for lines in raw_text.split('\n'):
                                    rurls = lines \
                                        .replace('\xad', '') \
                                        .replace('\n', '') \
                                        .replace("http://", '') \
                                        .replace("https://", '') \
                                        .replace(r'\s', '') \
                                        .replace('\t', '')

                                    xurl = regex.match(rurls)
                                    if xurl is not None:
                                        onionurl.append(xurl.group())

                except(requests.exceptions.ConnectionError,
                       requests.exceptions.ChunkedEncodingError,
                       requests.exceptions.ReadTimeout,
                       requests.exceptions.InvalidURL) as e:
                    print('Unable to connect to the URL because an error occurred.\n{e}'.format(e=e))

        except(requests.exceptions.ConnectionError,
               requests.exceptions.ChunkedEncodingError,
               requests.exceptions.ReadTimeout,
               requests.exceptions.InvalidURL) as e:
            print('Unable to connect to the URL because an error occurred.\n{e}'.format(e=e))

        return onionurl

    def raw(self, url):
        try:
            if url is not None:
                request = self.session.get(url, headers=self.random_headers)
                print('Connecting to {url} - {status}'.format(url=url, status=request.status_code))

                if request.status_code == 200:

                    soup = BeautifulSoup(request.content, features="lxml")
                    for s in soup(['script', 'style']):
                        s.decompose()

                    return ' '.join(soup.stripped_strings)

        except (requests.exceptions.ConnectionError,
                requests.exceptions.ChunkedEncodingError,
                requests.exceptions.ReadTimeout,
                requests.exceptions.TooManyRedirects) as e:
            pass


if __name__ == '__main__':
    app = Reddit()
    app.start
@ -0,0 +1,31 @@
#!/usr/bin/python3
# -*- coding: utf-8 -*-

__author__ = 'Andrey Glauzer'
__license__ = "MIT"
__version__ = "1.0.1"
__maintainer__ = "Andrey Glauzer"
__status__ = "Development"

import requests
from pathlib import Path

from onioningestor.sources import Source


class Plugin(Source):

    def __init__(self, logger, name, filename):
        self.logger = logger
        self.name = name
        self.filename = filename
        super().__init__(self)

    def run(self):
        filepath = Path(__file__).parents[2]/self.filename
        with open(filepath, 'r') as fp:
            lines = fp.read().splitlines()
            for onion in lines:
                yield self.onion(url=onion, source='simple-file', type='domain')