From e31f149af05fafe9edbcc124f53585b611c68078 Mon Sep 17 00:00:00 2001 From: danieleperera Date: Mon, 6 Jul 2020 22:45:06 +0000 Subject: [PATCH] omitted emails on README --- .gitignore | 5 +- README.md | 63 +-- example.yml | 48 +- onionscraper/__init__.py | 131 ----- onionscraper/__main__.py | 50 -- onionscraper/config.py | 170 ------ onionscraper/dbhandler.py | 774 ---------------------------- onionscraper/loghandler.py | 33 -- onionscraper/operators/__init__.py | 78 --- onionscraper/operators/onionscan.py | 259 ---------- onionscraper/operators/yara.py | 15 - onionscraper/sources/__init__.py | 41 -- onionscraper/sources/gist.py | 153 ------ requirements.txt | 2 - setup.py | 4 +- 15 files changed, 35 insertions(+), 1791 deletions(-) delete mode 100644 onionscraper/__init__.py delete mode 100644 onionscraper/__main__.py delete mode 100644 onionscraper/config.py delete mode 100644 onionscraper/dbhandler.py delete mode 100644 onionscraper/loghandler.py delete mode 100644 onionscraper/operators/__init__.py delete mode 100644 onionscraper/operators/onionscan.py delete mode 100644 onionscraper/operators/yara.py delete mode 100644 onionscraper/sources/__init__.py delete mode 100644 onionscraper/sources/gist.py diff --git a/.gitignore b/.gitignore index 3c7ae9b..8a26afc 100644 --- a/.gitignore +++ b/.gitignore @@ -1,10 +1,11 @@ +onion_master_list.* webui templates -OnionScraper.egg-info +OnionIngestor.egg-info screenshots dump.rdb onionscandb -config.ini +config.yml *.log *.pyc __pycache__ diff --git a/README.md b/README.md index 49c6e77..89a481c 100644 --- a/README.md +++ b/README.md @@ -188,68 +188,7 @@ The output of the result is json, and in the same format it is sent to the chose "relatedOnionDomains": null, "ipAddresses": null, "emailAddresses": [ - "hitman001@torbox3uiot6wchz.onion", - "jimmym0reno@yahoo.com", - "aimeerene1977@gmail.com", - "jennabrown15.jb@gmail.com", - "S.thames129@gmail.com", - "munira025@gmail.com", - "luisadavid20@gmail.com", - "cameron.stewart3@yahoo.com", - "janisea2013@gmail.com", - "Carinavieyra598@gmail.com", - "adrianmcdonald49@gmail.com", - "aaronjeans1@gmail.com", - "nsorrentino11@aol.com", - "amber4189@outlook.com", - "holliekestner@gmail.com", - "nattyperks01@gmail.com", - "dinavasa29@hotmail.com", - "lydiac612@gmail.com", - "bmduke24@gmail.com", - "markigharmony@gmail.com", - "ohdannyboy03@icloud.com", - "dkoontz18@gmail.com", - "janese_young@yahoo.com", - "gabssstobsss@gmail.com", - "thelake02@sbcglobal.net", - "timmyboston01@gmail.com", - "carloscharters1996@gmail.com", - "djamila28@outlook.com", - "heathermaeb@gmail.com", - "canelo2080@gmail.com", - "pamsanta.ps@gmail.com", - "horeka.mash98@gmail.com", - "oeh@gondtc.com", - "ohmygod990227@hotmail.com", - "marieazme@yahoo.com", - "shirleyteuta@gmail.com", - "janetcoppedge@sbcglobal.net", - "dimashilov30@gmail.com", - "benavides.kam@gmail.com", - "sonyainsonora@yahoo.com", - "benl04123@outlook.com", - "cmculbreath@fedex.com", - "antmeb@gmail.com", - "jrlopez61@hotmail.com", - "jaimie.mudge@hotmail.com", - "dreamworld1980@secmail.pro", - "tinajones@sympatico.ca", - "nobby@secmail.pro", - "twistedsun@secmail.pro", - "slayermodsv3@gmail.com", - "beastmodsv1@gmail.com", - "prestonkonicek@gmail.com", - "fnbrleaksv2@gmail.com", - "fnbrleaks@gmail.com", - "pushingeverythingyt@gmail.com", - "rachelkonicek@gmail.com", - "vsfortune@hotmail.com", - "dannajoywhite@gmail.com", - "jensenjody@gmail.com", - "jenniferjbisschop@gmail.com", - "hkbergado@gmail.com", - "mummifiedbabies@secmail.pro" + OMMITTED ], 
"analyticsIDs": null, "bitcoinAddresses": [ diff --git a/example.yml b/example.yml index 3edcb01..f27b086 100644 --- a/example.yml +++ b/example.yml @@ -3,8 +3,11 @@ general: # Run forever, check feeds once an hour. - daemon: False - sleep: 3600 + daemon: True + sleep: 10 + onion_validation: ([a-z2-7]{16,56}\.onion) + blacklist: pedo,xxx,infant,loli,porn,child,abuse,sex,drug,cocaine,dope,zoo,daddy,daughter,boy,girl,young,muder,cocks,year,old + interestingKeywords: t.me,feed,rss,xml,atom,dataleak,breach,blog,ransomware,source code,data breach elasticsearch: index: darkweb port : 9200 @@ -12,9 +15,13 @@ general: sources: # A few threat intel blogs to get you started! - - name: source-gist - module: gist - url: https://gist.github.com/search?l=Text&q=.onion + - name: simple-text-file + module: simplefile + filename: onion_master_list.txt + + # - name: source-gist + # module: gist + # url: https://gist.github.com/search?l=Text&q=.onion # - name: source-reddit # module: reddit @@ -43,20 +50,23 @@ sources: operators: - - name: onionscan-go - module: onionscan - binpath: /home/tony/go/bin/onionscan - socks5: - http: 'socks5h://127.0.0.1:9050' - https: 'socks5h://127.0.0.1:9050' - TorController: - port: 9051 - password: Xk5QP2haFMh8Y8D1060F1D7xaWEFG - timeout: 300 - retries: 2 - screenshots_path: null - blacklist: pedo,xxx,infant,loli,porn,child,abuse,sex,drug,cocaine,dope,zoo,daddy,daughter,boy,girl,young,muder - interestingKeywords: t.me,feed,rss,xml,atom,dataleak,breach,blog,ransomware,source code,data breach + - name: simple-html + module: html + socks5: + http: 'socks5h://127.0.0.1:9050' + https: 'socks5h://127.0.0.1:9050' + TorController: + port: 9051 + password: your-torcontroller-password-here + + - name: simple-screenshot + module: screenshot + screenshots_path: null + + - name: onionscan-go + module: onionscan + binpath: /home/tony/go/bin/onionscan + # - name: yara-rule # module: yara diff --git a/onionscraper/__init__.py b/onionscraper/__init__.py deleted file mode 100644 index af3a006..0000000 --- a/onionscraper/__init__.py +++ /dev/null @@ -1,131 +0,0 @@ -import sys -import time -import traceback -import collections - -from . import config -from . import dbhandler -from . import loghandler - - -class OnionManager: - """ThreatIngestor main work logic. - - Handles reading the config file, calling sources, maintaining state, and - sending artifacts to operators. - """ - def __init__(self, args): - # Load logger - log = loghandler.LoggerHandler(args.logLevel) - self.logger = log.start_logging() - # Load config - self.config = config.Config(args.configFile, self.logger) - - - # Load Elasticsearch. - try: - self.es = dbhandler.DbHandlerElasticSearch( - self.config.elasticsearch(), - self.logger) - except Exception as e: - # Error loading elasticsearch. - self.logger.error(e) - self.logger.debug(traceback.print_exc()) - sys.exit(1) - - - # Instantiate plugins. - try: - self.logger.info("Initializing sources") - self.sources = {name: source(self.logger, **kwargs) - for name, source, kwargs in self.config.sources()} - - self.logger.info("initializing operators") - self.operators = {name: operator(self.logger, **kwargs) - for name, operator, kwargs in self.config.operators()} - - self.logger.info("initializing notifiers") - #self.notifiers = {name: operator(**kwargs) - # for name, operator, kwargs in self.config.notifiers()} - except Exception as e: - # Error loading elasticsearch. 
- self.logger.error(e) - self.logger.debug(traceback.print_exc()) - sys.exit(1) - - - def run(self): - """Run once, or forever, depending on config.""" - if self.config.daemon(): - selfl.logger.info("Running forever, in a loop") - self.run_forever() - else: - self.logger.info("Running once, to completion") - self.run_once() - - - def run_once(self): - """Run each source once, passing artifacts to each operator.""" - # Track some statistics about artifacts in a summary object. - summary = collections.Counter() - - for source in self.sources: - # Run the source to collect artifacts. - self.logger.info(f"Running source '{source}'") - try: - onions = self.sources[source].run() - if onions: - self.logger.info(f'Found hidden links') - else: - self.logger.info('No links found') - except Exception as e: - self.logger.error(e) - self.logger.error(traceback.print_exc()) - continue - - # Process artifacts with each operator. - for operator in self.operators: - self.logger.info(f"Processing found onions with operator '{operator}'") - try: - doc = self.operators[operator].process(onions) - # Save the source state. - self.es.save(doc) - except Exception as e: - self.logger.error(e) - self.logger.error(traceback.print_exc()) - continue - - - -# # Record stats and update the summary. -# types = artifact_types(doc.get('interestingKeywords')) -# summary.update(types) -# for artifact_type in types: -# self.logger.info(f'types[artifact_type]') - - # Log the summary. - self.logger.info(f"New artifacts: {dict(summary)}") - - - def run_forever(self): - """Run forever, sleeping for the configured interval between each run.""" - while True: - self.run_once() - - self.logger.info(f"Sleeping for {self.config.sleep()} seconds") - time.sleep(self.config.sleep()) - - -def artifact_types(artifact_list): - """Return a dictionary with counts of each artifact type.""" - types = {} - for artifact in artifact_list: - artifact_type = artifact.__class__.__name__.lower() - if artifact_type in types: - types[artifact_type] += 1 - else: - types[artifact_type] = 1 - - return types - - diff --git a/onionscraper/__main__.py b/onionscraper/__main__.py deleted file mode 100644 index 6604d90..0000000 --- a/onionscraper/__main__.py +++ /dev/null @@ -1,50 +0,0 @@ -"""OnionScraper - -A Python3 application for indexing and scraping hidden services ElasticSearch - -Installation: - This application assumes you have python3 and pip3 installed. - - pip3 install -r requirements.txt - - -This software is provided subject to the MIT license stated below. --------------------------------------------------- - MIT License - - Permission is hereby granted, free of charge, to any person obtaining a copy - of this software and associated documentation files (the "Software"), to deal - in the Software without restriction, including without limitation the rights - to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - copies of the Software, and to permit persons to whom the Software is - furnished to do so, subject to the following conditions: - - The above copyright notice and this permission notice shall be included in all - copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - SOFTWARE. --------------------------------------------------- -""" -import argparse - -from onionscraper import OnionManager - - -# Load arguments from user -parser = argparse.ArgumentParser( - prog='onionscraper', - description=__doc__,formatter_class=argparse.RawDescriptionHelpFormatter) -parser.add_argument('-c', '--config',dest="configFile", required = True, help='Path to config file') -parser.add_argument("--log", dest="logLevel",default='INFO', choices=['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'], help="Set the logging level, default is INFO") - -args = parser.parse_args() - -app = OnionManager(args) - -app.run() diff --git a/onionscraper/config.py b/onionscraper/config.py deleted file mode 100644 index 18160c0..0000000 --- a/onionscraper/config.py +++ /dev/null @@ -1,170 +0,0 @@ -import io -import importlib -import traceback - -import yaml - -from pathlib import Path - -SOURCE = 'onionscraper.sources' -OPERATOR = 'onionscraper.operators' - -INTERNAL_OPTIONS = [ - 'saved_state', - 'module', - 'credentials', -] - -ARTIFACT_TYPES = 'artifact_types' -FILTER_STRING = 'filter' -ALLOWED_SOURCES = 'allowed_sources' -NAME = 'name' - - -class Config: - """Config read/write operations, and convenience methods.""" - def __init__(self, filename, logger): - """Read a config file.""" - self.logger = logger - self.filename = filename - with io.open(self.filename, 'r') as f: - try: - self.logger.info("Loading config file") - self.config = yaml.safe_load(f.read()) - except yaml.error.YAMLError: - self.logger.error("YAML error in config") - - - @staticmethod - def _load_plugin(plugin_type, plugin): - """Returns plugin class or raises an exception. 
- :raises: threatingestor.exceptions.PluginError - """ - try: - module = importlib.import_module('.'.join([plugin_type, plugin])) - return module.Plugin - except Exception as e: - print(e) - print(traceback.print_exc()) - - def daemon(self): - """Returns boolean, are we daemonizing?""" - return self.config['general']['daemon'] - - - def elasticsearch(self): - """Returns elasticsaerch config""" - return self.config['general']['elasticsearch'] - - - def sleep(self): - """Returns number of seconds to sleep between iterations, if daemonizing.""" - return self.config['general']['sleep'] - - -# def onionscanner(self): -# """Returns onionscanner config dict""" -# screenshots = self.config['onionscanner'].pop('screenshots_path', None) -# if screenshots: -# self.config['onionscanner']['screenshots_path'] = Path(screenshots) -# else: -# self.config['onionscanner']['screenshots_path'] = Path(__file__).parents[1]/'screenshots' -# blacklist = self.config['onionscanner'].pop('blacklist', None) -# if blacklist: -# self.config['onionscanner']['blacklist'] = blacklist.split(',') -# interestingKeywords = self.config['onionscanner'].pop('interestingKeywords', None) -# if interestingKeywords: -# self.config['onionscanner']['interestingKeywords'] = blacklist.split(',') -# return self.config['onionscanner'] - - - def notifiers(self): - """Returns notifiers config dictionary.""" - return self.config.get('notifiers', {}) - - - def logging(self): - """Returns logging config dictionary.""" - return self.config.get('logging', {}) - - - def credentials(self, credential_name): - """Return a dictionary with the specified credentials.""" - for credential in self.config['credentials']: - for key, value in credential.items(): - if key == NAME and value == credential_name: - return credential - return {} - - - def sources(self): - """Return a list of (name, Source class, {kwargs}) tuples. - :raises: threatingestor.exceptions.PluginError - """ - sources = [] - - for source in self.config['sources']: - kwargs = {} - for key, value in source.items(): - if key not in INTERNAL_OPTIONS: - kwargs[key] = value - - elif key == 'credentials': - # Grab these named credentials - credential_name = value - for credential_key, credential_value in self.credentials(credential_name).items(): - if credential_key != NAME: - kwargs[credential_key] = credential_value - - # load and initialize the plugin - self.logger.info(f"Found source '{source[NAME]}'") - sources.append((source[NAME], self._load_plugin(SOURCE, source['module']), kwargs)) - - self.logger.info(f"Found {len(sources)} total sources") - return sources - - - def operators(self): - """Return a list of (name, Operator class, {kwargs}) tuples. 
- :raises: threatingestor.exceptions.PluginError - """ - operators = [] - for operator in self.config['operators']: - kwargs = {} - for key, value in operator.items(): - if key not in INTERNAL_OPTIONS: - if key == ARTIFACT_TYPES: - # parse out special artifact_types option - artifact_types = [] - for artifact in value: - try: - artifact_types.append(threatingestor.artifacts.STRING_MAP[artifact.lower().strip()]) - except KeyError: - # ignore invalid artifact types - pass - kwargs[key] = artifact_types - - elif key == FILTER_STRING: - # pass in special filter_string option - kwargs['filter_string'] = value - - elif key == NAME: - # exclude name key from operator kwargs, since it's not used - pass - - else: - kwargs[key] = value - - elif key == 'credentials': - # Grab these named credentials - credential_name = value - for credential_key, credential_value in self.credentials(credential_name).items(): - if credential_key != NAME: - kwargs[credential_key] = credential_value - - # load and initialize the plugin - self.logger.info(f"Found operator '{operator[NAME]}'") - operators.append((operator[NAME], self._load_plugin(OPERATOR, operator['module']), kwargs)) - - self.logger.info(f"Found {len(operators)} total operators") - return operators diff --git a/onionscraper/dbhandler.py b/onionscraper/dbhandler.py deleted file mode 100644 index 0c50594..0000000 --- a/onionscraper/dbhandler.py +++ /dev/null @@ -1,774 +0,0 @@ -import sys -import traceback - -from elasticsearch import Elasticsearch, helpers - -class DbHandlerElasticSearch: - def __init__(self, config, logger): - self.logger = logger - self.logger.info('Creating Elasticsearch mapping') - self.config = config - self.mapping = ''' - { - "mappings": { - "_doc": { - "properties": { - "html": { - "type": "text" - }, - "onionscan": { - "type": "nested", - "properties": { - "bitcoinDetected": { - "type": "boolean" - }, - "bitcoinServices": { - "properties": { - "bitcoin": { - "properties": { - "detected": { - "type": "boolean" - }, - "prototocolVersion": { - "type": "long" - }, - "userAgent": { - "type": "text", - "fields": { - "keyword": { - "type": "keyword", - "ignore_above": 256 - } - } - } - } - }, - "bitcoin_test": { - "properties": { - "detected": { - "type": "boolean" - }, - "prototocolVersion": { - "type": "long" - }, - "userAgent": { - "type": "text", - "fields": { - "keyword": { - "type": "keyword", - "ignore_above": 256 - } - } - } - } - }, - "dogecoin": { - "properties": { - "detected": { - "type": "boolean" - }, - "prototocolVersion": { - "type": "long" - }, - "userAgent": { - "type": "text", - "fields": { - "keyword": { - "type": "keyword", - "ignore_above": 256 - } - } - } - } - }, - "litecoin": { - "properties": { - "detected": { - "type": "boolean" - }, - "prototocolVersion": { - "type": "long" - }, - "userAgent": { - "type": "text", - "fields": { - "keyword": { - "type": "keyword", - "ignore_above": 256 - } - } - } - } - } - } - }, - "certificates": { - "type": "nested", - "properties": { - "AuthorityKeyId": { - "type": "text", - "fields": { - "keyword": { - "type": "keyword", - "ignore_above": 256 - } - } - }, - "BasicConstraintsValid": { - "type": "boolean" - }, - "CRLDistributionPoints": { - "type": "text", - "fields": { - "keyword": { - "type": "keyword", - "ignore_above": 256 - } - } - }, - "DNSNames": { - "type": "text", - "fields": { - "keyword": { - "type": "keyword", - "ignore_above": 256 - } - } - }, - "ExtKeyUsage": { - "type": "long" - }, - "Extensions": { - "properties": { - "Critical": { - "type": "boolean" - }, - 
"Id": { - "type": "long" - }, - "Value": { - "type": "text", - "fields": { - "keyword": { - "type": "keyword", - "ignore_above": 256 - } - } - } - } - }, - "IsCA": { - "type": "boolean" - }, - "Issuer": { - "properties": { - "CommonName": { - "type": "text", - "fields": { - "keyword": { - "type": "keyword", - "ignore_above": 256 - } - } - }, - "Country": { - "type": "text", - "fields": { - "keyword": { - "type": "keyword", - "ignore_above": 256 - } - } - }, - "Locality": { - "type": "text", - "fields": { - "keyword": { - "type": "keyword", - "ignore_above": 256 - } - } - }, - "Names": { - "properties": { - "Type": { - "type": "long" - }, - "Value": { - "type": "text", - "fields": { - "keyword": { - "type": "keyword", - "ignore_above": 256 - } - } - } - } - }, - "Organization": { - "type": "text", - "fields": { - "keyword": { - "type": "keyword", - "ignore_above": 256 - } - } - }, - "OrganizationalUnit": { - "type": "text", - "fields": { - "keyword": { - "type": "keyword", - "ignore_above": 256 - } - } - }, - "Province": { - "type": "text", - "fields": { - "keyword": { - "type": "keyword", - "ignore_above": 256 - } - } - }, - "SerialNumber": { - "type": "text", - "fields": { - "keyword": { - "type": "keyword", - "ignore_above": 256 - } - } - } - } - }, - "IssuingCertificateURL": { - "type": "text", - "fields": { - "keyword": { - "type": "keyword", - "ignore_above": 256 - } - } - }, - "KeyUsage": { - "type": "long" - }, - "MaxPathLen": { - "type": "long" - }, - "MaxPathLenZero": { - "type": "boolean" - }, - "NotAfter": { - "type": "date" - }, - "NotBefore": { - "type": "date" - }, - "OCSPServer": { - "type": "text", - "fields": { - "keyword": { - "type": "keyword", - "ignore_above": 256 - } - } - }, - "PermittedDNSDomainsCritical": { - "type": "boolean" - }, - "PolicyIdentifiers": { - "type": "long" - }, - "PublicKey": { - "properties": { - "E": { - "type": "text" - }, - "N": { - "type": "text" - } - } - }, - "PublicKeyAlgorithm": { - "type": "long" - }, - "Raw": { - "type": "text", - "fields": { - "keyword": { - "type": "keyword", - "ignore_above": 256 - } - } - }, - "RawIssuer": { - "type": "text", - "fields": { - "keyword": { - "type": "keyword", - "ignore_above": 256 - } - } - }, - "RawSubject": { - "type": "text", - "fields": { - "keyword": { - "type": "keyword", - "ignore_above": 256 - } - } - }, - "RawSubjectPublicKeyInfo": { - "type": "text", - "fields": { - "keyword": { - "type": "keyword", - "ignore_above": 256 - } - } - }, - "RawTBSCertificate": { - "type": "text", - "fields": { - "keyword": { - "type": "keyword", - "ignore_above": 256 - } - } - }, - "SerialNumber": { - "type": "text" - }, - "Signature": { - "type": "text", - "fields": { - "keyword": { - "type": "keyword", - "ignore_above": 256 - } - } - }, - "SignatureAlgorithm": { - "type": "long" - }, - "Subject": { - "properties": { - "CommonName": { - "type": "text", - "fields": { - "keyword": { - "type": "keyword", - "ignore_above": 256 - } - } - }, - "Country": { - "type": "text", - "fields": { - "keyword": { - "type": "keyword", - "ignore_above": 256 - } - } - }, - "Locality": { - "type": "text", - "fields": { - "keyword": { - "type": "keyword", - "ignore_above": 256 - } - } - }, - "Names": { - "properties": { - "Type": { - "type": "long" - }, - "Value": { - "type": "text", - "fields": { - "keyword": { - "type": "keyword", - "ignore_above": 256 - } - } - } - } - }, - "Organization": { - "type": "text", - "fields": { - "keyword": { - "type": "keyword", - "ignore_above": 256 - } - } - }, - "OrganizationalUnit": { - "type": 
"text", - "fields": { - "keyword": { - "type": "keyword", - "ignore_above": 256 - } - } - }, - "Province": { - "type": "text", - "fields": { - "keyword": { - "type": "keyword", - "ignore_above": 256 - } - } - }, - "SerialNumber": { - "type": "text", - "fields": { - "keyword": { - "type": "keyword", - "ignore_above": 256 - } - } - } - } - }, - "SubjectKeyId": { - "type": "text", - "fields": { - "keyword": { - "type": "keyword", - "ignore_above": 256 - } - } - }, - "Version": { - "type": "long" - } - } - }, - "crawls": { - "type": "nested", - "fields": { - "keyword": { - "type": "keyword", - "ignore_above": 256 - } - } - }, - "dateScanned": { - "type": "date" - }, - "f_name": { - "type": "text", - "fields": { - "keyword": { - "type": "keyword", - "ignore_above": 256 - } - } - }, - "ftpBanner": { - "type": "text", - "fields": { - "keyword": { - "type": "keyword", - "ignore_above": 256 - } - } - }, - "ftpDetected": { - "type": "boolean" - }, - "ftpFingerprint": { - "type": "text", - "fields": { - "keyword": { - "type": "keyword", - "ignore_above": 256 - } - } - }, - "hiddenService": { - "type": "text", - "fields": { - "keyword": { - "type": "keyword", - "ignore_above": 256 - } - } - }, - "identifierReport": { - "properties": { - "analyticsIDs": { - "type": "text", - "fields": { - "keyword": { - "type": "keyword", - "ignore_above": 256 - } - } - }, - "bitcoinAddresses": { - "type": "text", - "fields": { - "keyword": { - "type": "keyword", - "ignore_above": 256 - } - } - }, - "emailAddresses": { - "type": "text", - "fields": { - "keyword": { - "type": "keyword", - "ignore_above": 256 - } - } - }, - "exifImages": { - "properties": { - "exifTags": { - "properties": { - "name": { - "type": "text", - "fields": { - "keyword": { - "type": "keyword", - "ignore_above": 256 - } - } - }, - "value": { - "type": "text", - "fields": { - "keyword": { - "type": "keyword", - "ignore_above": 256 - } - } - } - } - }, - "location": { - "type": "text", - "fields": { - "keyword": { - "type": "keyword", - "ignore_above": 256 - } - } - } - } - }, - "foundApacheModStatus": { - "type": "boolean" - }, - "linkedOnions": { - "type": "text", - "fields": { - "keyword": { - "type": "keyword", - "ignore_above": 256 - } - } - }, - "openDirectories": { - "type": "text", - "fields": { - "keyword": { - "type": "keyword", - "ignore_above": 256 - } - } - }, - "privateKeyDetected": { - "type": "boolean" - }, - "serverVersion": { - "type": "text", - "fields": { - "keyword": { - "type": "keyword", - "ignore_above": 256 - } - } - } - } - }, - "ircDetected": { - "type": "boolean" - }, - "lastAction": { - "type": "text", - "fields": { - "keyword": { - "type": "keyword", - "ignore_above": 256 - } - } - }, - "mongodbDetected": { - "type": "boolean" - }, - "online": { - "type": "boolean" - }, - "performedScans": { - "type": "text", - "fields": { - "keyword": { - "type": "keyword", - "ignore_above": 256 - } - } - }, - "pgpKeys": { - "properties": { - "armoredKey": { - "type": "text", - "fields": { - "keyword": { - "type": "keyword", - "ignore_above": 256 - } - } - }, - "fingerprint": { - "type": "text", - "fields": { - "keyword": { - "type": "keyword", - "ignore_above": 256 - } - } - }, - "identity": { - "type": "text", - "fields": { - "keyword": { - "type": "keyword", - "ignore_above": 256 - } - } - } - } - }, - "ricochetDetected": { - "type": "boolean" - }, - "skynetDetected": { - "type": "boolean" - }, - "smtpBanner": { - "type": "text", - "fields": { - "keyword": { - "type": "keyword", - "ignore_above": 256 - } - } - }, - "smtpDetected": 
{ - "type": "boolean" - }, - "smtpFingerprint": { - "type": "text", - "fields": { - "keyword": { - "type": "keyword", - "ignore_above": 256 - } - } - }, - "sshBanner": { - "type": "text", - "fields": { - "keyword": { - "type": "keyword", - "ignore_above": 256 - } - } - }, - "sshDetected": { - "type": "boolean" - }, - "sshKey": { - "type": "text", - "fields": { - "keyword": { - "type": "keyword", - "ignore_above": 256 - } - } - }, - "timedOut": { - "type": "boolean" - }, - "tlsDetected": { - "type": "boolean" - }, - "vncDetected": { - "type": "boolean" - }, - "webDetected": { - "type": "boolean" - }, - "xmppDetected": { - "type": "boolean" - } - } - }, - "screenshots": { - "type": "nested", - "properties": { - "dateScreenshoted": { - "type": "date" - }, - "filename": { - "type": "text", - "fields": { - "keyword": { - "type": "keyword", - "ignore_above": 256 - } - } - } - } - } - } - } - } -} - ''' - try: - self.es = Elasticsearch([{ - 'host':self.config['host'], - 'port':self.config['port']}]) - self.es.indices.create( - index=self.config['index'], - body=self.mapping, - ignore=400) - except Exception as e: - self.logger.error(e) - self.logger.error(traceback.format_exc()) - sys.exit(0) - - def count(self): - self.es.indices.refresh(self.index) - status = self.es.count(index=self.index) - if status['_shards']['successful'] == 1: - self.logger.info('Successful') - self.logger.info('Count:%d',status['count']) - else: - self.logger.error(status) - - def save(self, doc): - self.es.index(index=self.index,body=doc) - self.count() diff --git a/onionscraper/loghandler.py b/onionscraper/loghandler.py deleted file mode 100644 index 1189b4e..0000000 --- a/onionscraper/loghandler.py +++ /dev/null @@ -1,33 +0,0 @@ -import os -import logging -from pathlib import Path - -class LoggerHandler(): - def __init__(self, level): - self.level = getattr(logging, level) - self.logger = logging.getLogger("OnionScraper") - self.logger.setLevel(self.level) - - # create console handler and set level to debug - ch = logging.StreamHandler() - ch.setLevel(self.level) - - # create file logging - logFile = Path(__file__).parents[1] - logging_path = os.path.join(logFile, "info.log") - fh = logging.FileHandler(logging_path) - - # create formatter - formatter = logging.Formatter('[%(asctime)s] - %(name)s - %(levelname)s - %(message)s',datefmt='%a, %d %b %Y %H:%M:%S') - formatter_console = logging.Formatter('[%(asctime)s] - %(levelname)s - %(message)s',datefmt='%d %b %Y %H:%M:%S') - # add formatter to ch - ch.setFormatter(formatter_console) - fh.setFormatter(formatter) - # add ch to logger - self.logger.addHandler(ch) #added logging into console - self.logger.addHandler(fh) #added logging into file - - def start_logging(self): - self.logger.info('Starting OnionScraper') - return self.logger - diff --git a/onionscraper/operators/__init__.py b/onionscraper/operators/__init__.py deleted file mode 100644 index 8340189..0000000 --- a/onionscraper/operators/__init__.py +++ /dev/null @@ -1,78 +0,0 @@ -import re - - -class Operator: - """Base class for all Operator plugins. - - Note: This is an abstract class. You must extend ``__init__`` and call - ``super`` to ensure this class's constructor is called. You must override - ``handle_artifact`` with the same signature. You may define additional - ``handle_{artifact_type}`` methods as needed (see the threatkb operator for - an example) - these methods are purely convention, and are not required. 
- - When adding additional methods to child classes, consider prefixing the - method name with an underscore to denote a ``_private_method``. Do not - override other existing methods from this class. - """ - def __init__(self, artifact_types=None, filter_string=None, allowed_sources=None): - """Override this constructor in child classes. - - The arguments above (artifact_types, filter_string, allowed_sources) - should be accepted explicity as above, in all child classes. - - Additional arguments should be added: url, auth, etc, whatever is - needed to set up the object. - - Each operator should default self.artifact_types to a list of Artifacts - supported by the plugin, and allow passing in artifact_types to - overwrite that default. - - Example: - - >>> self.artifact_types = artifact_types or [ - ... artifacts.IPAddress, - ... artifacts.Domain, - ... ] - - It's recommended to call this __init__ method via super from all child - classes. Remember to do so *before* setting any default artifact_types. - """ - self.artifact_types = artifact_types or [] - self.filter_string = filter_string or '' - self.allowed_sources = allowed_sources or [] - - - def handle_onion(self, url): - """Override with the same signature. - - :param artifact: A single ``Artifact`` object. - :returns: None (always ignored) - """ - raise NotImplementedError() - - - def _artifact_is_allowed(self, artifact): - """Returns True if this is allowed by this plugin's filters.""" -# # Must be in allowed_types. -# if not any(isinstance(artifact, t) for t in self.artifact_types): -# return False -# -# # Must match the filter string. -# if not artifact.match(self.filter_string): -# return False -# -# # Must be in allowed_sources, if set. -# if self.allowed_sources and not any( -# [re.compile(p).search(artifact.source_name) -# for p in self.allowed_sources]): -# return False -# - return True - - - def process(self, onions): - """Process all applicable onions.""" - for onion in onions: - if self._artifact_is_allowed(onion.url): - self.handle_onion(onion) - diff --git a/onionscraper/operators/onionscan.py b/onionscraper/operators/onionscan.py deleted file mode 100644 index 94b0bad..0000000 --- a/onionscraper/operators/onionscan.py +++ /dev/null @@ -1,259 +0,0 @@ -import re -import os -import sys -import json -import time -import random -import traceback -import subprocess -from uuid import uuid4 -from pathlib import Path -from datetime import datetime as dt -from json.decoder import JSONDecodeError -from concurrent.futures import ProcessPoolExecutor -from threading import Timer - -import requests - -from stem.control import Controller -from stem import Signal - -from selenium import webdriver -from selenium.webdriver.firefox.options import Options -from selenium.webdriver.firefox.firefox_binary import FirefoxBinary - -from onionscraper.operators import Operator - -class Plugin(Operator): - """OnionScraper main work logic. - - Handles reading the config file, calling sources, maintaining state and - sending artifacts to operators. 
- """ - def __init__(self, logger, **kwargs): - self.logger = logger - self.logger.info('Initializing OnionScanner') - screenshots = kwargs.pop('screenshots_path', None) - if screenshots: - self.screenshots = Path(screenshots) - else: - self.screenshots = Path(__file__).parents[1]/'screenshots' - self.onionscan = kwargs['binpath'] - self.timeout = int(kwargs['timeout']) - self.proxy = kwargs['socks5'] - self.torControl = kwargs['TorController'] - self.retries = int(kwargs['retries']) - self.headers ={ - 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:72.0) Gecko/20100101 Firefox/72.0', - 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', - 'Accept-Language':'en-US,en;q=0.5', - 'DNT': '1', 'Connection': - 'keep-alive', - 'Upgrade-Insecure-Requests': '1'} - - - blacklist = kwargs['blacklist'].split(',') - self.blacklist = re.compile('|'.join([re.escape(word) for word in blacklist]), re.IGNORECASE) - keywords = kwargs['interestingKeywords'].split(',') - self.keywords = re.compile('|'.join([re.escape(word) for word in keywords]), re.IGNORECASE) - self.session = self.get_tor_session() - - def response(self, status, content, onion): - """ - status: success/failure - content: dict - onion: str - return: dict - """ - return {'status': status, 'data': content, 'onion': onion} - - def parseDoc(self, data): - data['onionscan'].pop('simpleReport', None) - crawls = data['onionscan'].pop('crawls', None) - hiddenService = data['onionscan'].pop('hiddenService', None) - data['onionscan']['crawls'] = [*crawls] - data['hiddenService'] = hiddenService - for onion in crawls.keys(): - print(onion) - #q.enqueue(self.crawl, onion) - #with open('test.json', 'w', encoding='utf-8') as f: - # json.dump(data, f, ensure_ascii=False, indent=4) - return data - - def format_directory(self, directory): - d = dt.now() - year = str(d.year) - month = str(d.month) - # prefix month and day with "0" if it is only one digit - if len(month) < 2: - month = "0" + month - day = str(d.day) - if len(day) < 2: - day = "0" + day - save_path = directory/year/month/day - if not os.path.isdir(save_path): - self.logger.info("[*] Creating directory to save screenshots") - os.makedirs(save_path) - - return save_path - - def take_screenshot(self, save_path, onion): - binary = FirefoxBinary('/home/tony/Projects/OnionScraper/geckodriver') - fp = webdriver.FirefoxProfile() - fp.set_preference('network.proxy.type', 1) - fp.set_preference('network.proxy.socks', '127.0.0.1') - fp.set_preference('network.proxy.socks_port', 9050) - fp.set_preference('network.proxy.socks_remote_dns', True) - - options = Options() - options.headless = True - driver = webdriver.Firefox( - executable_path='/home/tony/Projects/OnionScraper/geckodriver', - options=options, - firefox_profile=fp) - url = 'http://' + onion - driver.get(url) - uid = str(uuid4()).split('-')[0] - filename = f"{onion}_screenshot_{uid}.png" - f_name = f"{save_path}/{filename}" - driver.save_screenshot(f_name) - - driver.quit() - - if os.path.isfile(f_name): - self.logger.info(f'[*] Screenshot was taken. 
{f_name}') - dateScreenshoted = dt.utcnow().strftime('%Y-%m-%dT%H:%M:%S.%f')+ 'Z' - result = {'dateScreenshoted':dateScreenshoted,'filename':filename} - return self.response("success",result,onion) - else: - self.logger.error('[x] Unable to take screenshot') - return self.response("failure",None,onion) - - - - def get_tor_session(self): - try: - s = requests.session() - s.proxies = self.proxy - s.headers.update(self.headers) - except Exception as e: - self.logger.error(e) - self.logger.debug(traceback.print_exc()) - return s - - # signal TOR for a new connection - def renew_connection(self): - with Controller.from_port(port = self.torControl['port']) as controller: - # Now we switch TOR identities to make sure we have a good connection - self.logger.info('Getting new Tor IP') - # authenticate to our local TOR controller - controller.authenticate(self.torControl['password']) - # send the signal for a new identity - controller.signal(Signal.NEWNYM) - # wait for the new identity to be initialized - time.sleep(controller.get_newnym_wait()) - session = self.get_tor_session() - self.logger.info(f"IP is {session.get('http://httpbin.org/ip').json()['origin']}") - - def handle_timeout(self, process, onion): - # - # Handle a timeout from the onionscan process. - # - - try: - # kill the onionscan process - process.kill() - self.logger.info("[!!!] Killed the onionscan process.") - except: - pass - self.renew_connection() - return - - def run_sessions(self, onion): - retry = 0 - result = None - while True: - try: - url = 'http://'+onion - self.logger.info(url) - content = self.session.get(url) - if content.status_code == 200: - result = content.json() - except JSONDecodeError as e: - self.logger.debug(f'JSONDecodeError {e}') - result = content.text - except Exception as e: - self.logger.error(e) - self.logger.debug(traceback.print_exc()) - finally: - if result: - return self.response("success",result,onion) - else: - self.logger.info('[x] No results found retrying ...') - retry += 1 - self.renew_connection() - if retry > self.retries: - self.logger.error('[x] Max retries exceeded') - return self.response("failure",None, onion) - - def run_onionscan(self, onion): - self.logger.info("[*] Running onionscan on %s", onion) - - # fire up onionscan - process = subprocess.Popen([self.onionscan,"--webport=0","--jsonReport","--simpleReport=false",onion],stdout=subprocess.PIPE,stderr=subprocess.PIPE) - - # start the timer and let it run till timeout minutes - process_timer = Timer(300,self.handle_timeout,args=[process,onion]) - process_timer.start() - - # wait for the onion scan results - stdout = process.communicate()[0] - - # we have received valid results so we can kill the timer - if process_timer.is_alive(): - process_timer.cancel() - return self.response("success",json.loads(stdout),onion) - - self.logger.info("[!!!] 
Process timed out for %s", onion) - - return self.response("failure",None, onion) - - def handle_onion(self, onion_tuple): - onion = onion_tuple.url - self.logger.info(f'Processing {onion} with onionscan') - try: - blacklist_URL = self.blacklist.search(onion) - if blacklist_URL: - self.logger.info(f"[X] Blocked by blacklist => matched keyword {blacklist_URL.group()}") - else: - self.logger.debug("[*] URL blacklist test: PASSED") - results = self.run_onionscan(onion) - if results['status'] == 'success' and results['data']['webDetected'] == 'true': - content = self.run_sessions(onion) - if content['status'] == 'success': - blacklist_CONTENT = self.blacklist.search(content['data']) - if blacklist_CONTENT: - self.logger.info(f"[X] Blocked by blacklist content => matched keyword {blacklist_CONTENT.group()}") - else: - self.logger.debug("[*] CONTENT blacklist test: PASSED") - screenshot = self.take_screenshot(self.format_directory(self.screenshots), onion) - self.logger.info("Indexing!") - doc = { - 'onionscan':json.loads(results['data']), - 'html':content['data'], - 'screenshots':screenshot['data'], - 'interestingKeywords':self.interestingKeywords.findall(content['data']) - } - return self.parseDoc(doc) - - else: - self.logger.info(f"[x] hidden service {onion} is not active") - except Exception as e: - self.logger.error(e) - self.logger.error(traceback.print_exc()) - finally: - pass - #sys.exit(0) - - - diff --git a/onionscraper/operators/yara.py b/onionscraper/operators/yara.py deleted file mode 100644 index 794c093..0000000 --- a/onionscraper/operators/yara.py +++ /dev/null @@ -1,15 +0,0 @@ - -from onionscraper.operators import Operator - -class Plugin(Operator): - """Operator for output to flat CSV file.""" - def __init__(self, filename, base_score): - """CSV operator.""" - self.filename = filename - - #super(Plugin, self).__init__(artifact_types, filter_string, allowed_sources) - - - def handle_artifact(self, artifact): - """Operate on a single artifact.""" - pass diff --git a/onionscraper/sources/__init__.py b/onionscraper/sources/__init__.py deleted file mode 100644 index 14bb4e4..0000000 --- a/onionscraper/sources/__init__.py +++ /dev/null @@ -1,41 +0,0 @@ -from collections import namedtuple - -class Source(object): - """Base class for all Source plugins. - Note: This is an abstract class. You must override ``__init__`` and ``run`` - in child classes. You should not override ``process_element``. When adding - additional methods to child classes, consider prefixing the method name - with an underscore to denote a ``_private_method``. - """ - def __init__(self, name, *args, **kwargs): - """Override this constructor in child classes. - The first argument must always be ``name``. - Other argumentss should be url, auth, etc, whatever is needed to set - up the object. - """ - self.onion = namedtuple('onion', ['url','source','type']) - - - def run(self): - """Run and return ``(saved_state, list(Artifact))``. - Override this method in child classes. - The method signature and return values must remain consistent. - The method should attempt to pick up where we left off using - ``saved_state``, if supported. If ``saved_state`` is ``None``, you can - assume this is a first run. If state is maintained by the remote - resource (e.g. as it is with SQS), ``saved_state`` should always be - ``None``. - """ - raise NotImplementedError() - - - def process_element(self, content, reference_link, include_nonobfuscated=False): - """Take a single source content/url and return a list of Artifacts. 
- This is the main work block of Source plugins, which handles - IOC extraction and artifact creation. - :param content: String content to extract from. - :param reference_link: Reference link to attach to all artifacts. - :param include_nonobfuscated: Include non-defanged URLs in output? - """ - logger.debug(f"Processing in source '{self.name}'") - diff --git a/onionscraper/sources/gist.py b/onionscraper/sources/gist.py deleted file mode 100644 index 15e2049..0000000 --- a/onionscraper/sources/gist.py +++ /dev/null @@ -1,153 +0,0 @@ -#!/usr/bin/python3 -# -*- coding: utf-8 -*- - -__author__ = 'Andrey Glauzer' -__license__ = "MIT" -__version__ = "1.0.1" -__maintainer__ = "Andrey Glauzer" -__status__ = "Development" - -import requests -import json -import re -import re -import urllib.parse -from random import choice -import time -from bs4 import BeautifulSoup - - -from onionscraper.sources import Source - - -class Plugin(Source): - - def __init__(self, logger, name, url): - self.logger = logger - self.name = name - self.url = url - self.desktop_agents = [ - 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:60.0) Gecko/20100101 Firefox/60.0'] - super().__init__(self) - - - def run(self): - self.logger.info('Starting Gist Scraper') - self.cookies() - self.pagination() - self.scraping() - return self.raw() - - @property - def random_headers(self): - return { - 'User-Agent': choice(self.desktop_agents), - 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8' - } - - def cookies(self): - - self.logger.info('Setting GIST cookies') - - with requests.Session() as self.session: - self.headers = self.random_headers - - request = self.session.get(self.url, headers=self.headers) - - if request.status_code == 200: - pass - else: - self.logger.error('No Response from GIST') - - def pagination(self): - request = self.session.get( - f"https://gist.github.com/search?l=Text&q={urllib.parse.quote('.onio')}", headers=self.headers) - self.soup = BeautifulSoup(request.content, features="lxml") - - pages = [] - self.urls = [self.url] - try: - for pagination in self.soup.find('div', {'class': 'pagination'}).findAll('a'): - pages.append(pagination.get_text()) - except: - pages = False - - if pages: - cont = 2 - while cont <= 1: # int(pages[-2]): - cont += 1 - full_url = f"https://gist.github.com/search?l=Text&p={cont-1}&q={urllib.parse.quote('.onio')}" - self.urls.append(full_url) - - def scraping(self): - url = [] - for inurl in self.urls: - self.logger.info(f"Connecting to {inurl}") - time.sleep(5) - request = self.session.get(inurl, headers=self.headers) - - if request.status_code == 200: - soup = BeautifulSoup(request.content, features="lxml") - for code in soup.findAll('div', {'class': 'gist-snippet'}): - if '.onion' in code.get_text().lower(): - for raw in code.findAll('a', {'class': 'link-overlay'}): - try: - url.append(raw['href']) - except: - pass - self.urls_raw = [] - for get in url: - self.logger.info(f"Connecting to {get}") - time.sleep(5) - try: - request = self.session.get(get, headers=self.headers) - - if request.status_code == 200: - soup = BeautifulSoup(request.content, features="lxml") - - for raw in soup.findAll('a', {'class': 'btn btn-sm'}): - try: - gist_url = f"https://gist.githubusercontent.com{raw['href']}" - - self.urls_raw.append(gist_url) - - except: - pass - except(requests.exceptions.ConnectionError, - requests.exceptions.ChunkedEncodingError, - requests.exceptions.ReadTimeout, - requests.exceptions.InvalidURL) as e: - self.logger.error( - f"I was 
unable to connect to the url, because an error occurred.\n{e}") - pass - - def raw(self): - self.logger.info('Performing replaces and regex. WAIT...') - itens = [] - onions = [] - for raw in self.urls_raw: - if '.txt' in raw.lower() \ - or '.csv' in raw.lower(): - time.sleep(5) - request = self.session.get(raw, headers=self.headers) - self.soup = BeautifulSoup(request.content, features="lxml") - for pre in self.soup.findAll('body'): - list = pre.get_text().split('\n') - itens.extend(list) - - regex = re.compile( - "[A-Za-z0-9]{0,12}\.?[A-Za-z0-9]{12,50}\.onion") - - for lines in itens: - rurls = lines \ - .replace('\xad', '') \ - .replace('\n', '') \ - .replace("http://", '') \ - .replace("https://", '') \ - .replace("www.", "") - - url = regex.match(rurls) - - if url is not None: - onions.append(self.onion(url=url.group(), source='gist', type='domain')) - return onions diff --git a/requirements.txt b/requirements.txt index cbdc092..1a07b5e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,8 +5,6 @@ click==7.1.2 elasticsearch==7.8.0 idna==2.10 lxml==4.5.1 -# Editable Git install with no remote (OnionScraper==1.0.0) --e /home/tony/Projects/OnionScraper PySocks==1.7.1 PyYAML==5.3.1 requests==2.24.0 diff --git a/setup.py b/setup.py index 76aa0fa..bdf3b1f 100644 --- a/setup.py +++ b/setup.py @@ -8,14 +8,14 @@ def readme_file_contents(): setup( - name='OnionScraper', + name='OnionIngestor', version='1.0.0', description='Python app to scraper and index hidden websites', long_description=readme_file_contents(), author='dan', author_email='test@google.com', license='MIT', - packages=['onionscraper'], + packages=['onioningestor'], zip_safe=False, install_requires=[] )
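
Note (illustrative, not part of this patch): the updated example.yml now points the default source at `module: simplefile` reading `onion_master_list.txt`, but that module is not included in this diff. Below is a minimal sketch of what such a source plugin could look like, assuming the `Source` base class and its `onion` namedtuple from the removed `onionscraper/sources/__init__.py` survive the rename; the import path, class layout, and file-parsing details are assumptions, not the project's actual implementation.

```python
# Illustrative sketch only -- "simplefile" is referenced by the new example.yml
# but is not shipped in this patch. Names and behaviour are assumptions based on
# the Source base class and the gist source shown in the removed code above.
import re

from onioningestor.sources import Source  # assumed path after the rename; was onionscraper.sources


class Plugin(Source):
    """Read one onion address per line from a local text file."""

    def __init__(self, logger, name, filename):
        self.logger = logger
        self.name = name
        self.filename = filename
        super().__init__(name)  # base class defines the onion namedtuple

    def run(self):
        # Same validation pattern as the 'onion_validation' option in example.yml.
        pattern = re.compile(r'([a-z2-7]{16,56}\.onion)')
        onions = []
        with open(self.filename, 'r', encoding='utf-8') as f:
            for line in f:
                match = pattern.search(line)
                if match:
                    onions.append(self.onion(url=match.group(1),
                                             source=self.name,
                                             type='domain'))
        self.logger.info(f"Loaded {len(onions)} onion URLs from {self.filename}")
        return onions
```

A source like this would be wired up exactly as in the new example.yml: the `name`, `module`, and `filename` keys are passed through as keyword arguments when the plugin is instantiated by the config loader.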
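
Note (illustrative, not part of this patch): example.yml also gains top-level `onion_validation` and `blacklist` options under `general`, which previously lived only inside the onionscan operator's config. The sketch below shows one way those options might be compiled and applied before URLs reach any operator, reusing the keyword-joining pattern the removed operator used; the helper name and wiring are assumptions.

```python
# Illustrative sketch only: consuming the new 'general.onion_validation' and
# 'general.blacklist' options. The keyword list here is a short subset of the
# values configured in example.yml.
import re

onion_validation = r'([a-z2-7]{16,56}\.onion)'
blacklist = 'pedo,xxx,infant,loli,porn,child,abuse,drug,zoo'

validator = re.compile(onion_validation)
# Same escape-and-join approach as the removed onionscan operator's blacklist regex.
blocked = re.compile('|'.join(re.escape(w) for w in blacklist.split(',')),
                     re.IGNORECASE)

def accept(url: str) -> bool:
    """Keep only syntactically valid onion URLs that match no blacklisted keyword."""
    return bool(validator.search(url)) and not blocked.search(url)
```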