omitted emails on README

pull/1/head
danieleperera 4 years ago
parent 8258056bef
commit e31f149af0

.gitignore vendored

@@ -1,10 +1,11 @@
onion_master_list.*
webui
templates
OnionScraper.egg-info
OnionIngestor.egg-info
screenshots
dump.rdb
onionscandb
config.ini
config.yml
*.log
*.pyc
__pycache__

@@ -188,68 +188,7 @@ The output of the result is json, and in the same format it is sent to the chose
"relatedOnionDomains": null,
"ipAddresses": null,
"emailAddresses": [
"hitman001@torbox3uiot6wchz.onion",
"jimmym0reno@yahoo.com",
"aimeerene1977@gmail.com",
"jennabrown15.jb@gmail.com",
"S.thames129@gmail.com",
"munira025@gmail.com",
"luisadavid20@gmail.com",
"cameron.stewart3@yahoo.com",
"janisea2013@gmail.com",
"Carinavieyra598@gmail.com",
"adrianmcdonald49@gmail.com",
"aaronjeans1@gmail.com",
"nsorrentino11@aol.com",
"amber4189@outlook.com",
"holliekestner@gmail.com",
"nattyperks01@gmail.com",
"dinavasa29@hotmail.com",
"lydiac612@gmail.com",
"bmduke24@gmail.com",
"markigharmony@gmail.com",
"ohdannyboy03@icloud.com",
"dkoontz18@gmail.com",
"janese_young@yahoo.com",
"gabssstobsss@gmail.com",
"thelake02@sbcglobal.net",
"timmyboston01@gmail.com",
"carloscharters1996@gmail.com",
"djamila28@outlook.com",
"heathermaeb@gmail.com",
"canelo2080@gmail.com",
"pamsanta.ps@gmail.com",
"horeka.mash98@gmail.com",
"oeh@gondtc.com",
"ohmygod990227@hotmail.com",
"marieazme@yahoo.com",
"shirleyteuta@gmail.com",
"janetcoppedge@sbcglobal.net",
"dimashilov30@gmail.com",
"benavides.kam@gmail.com",
"sonyainsonora@yahoo.com",
"benl04123@outlook.com",
"cmculbreath@fedex.com",
"antmeb@gmail.com",
"jrlopez61@hotmail.com",
"jaimie.mudge@hotmail.com",
"dreamworld1980@secmail.pro",
"tinajones@sympatico.ca",
"nobby@secmail.pro",
"twistedsun@secmail.pro",
"slayermodsv3@gmail.com",
"beastmodsv1@gmail.com",
"prestonkonicek@gmail.com",
"fnbrleaksv2@gmail.com",
"fnbrleaks@gmail.com",
"pushingeverythingyt@gmail.com",
"rachelkonicek@gmail.com",
"vsfortune@hotmail.com",
"dannajoywhite@gmail.com",
"jensenjody@gmail.com",
"jenniferjbisschop@gmail.com",
"hkbergado@gmail.com",
"mummifiedbabies@secmail.pro"
OMITTED
],
"analyticsIDs": null,
"bitcoinAddresses": [

@@ -3,8 +3,11 @@
general:
# Run forever, check feeds once an hour.
daemon: False
sleep: 3600
daemon: True
sleep: 10
onion_validation: ([a-z2-7]{16,56}\.onion)
blacklist: pedo,xxx,infant,loli,porn,child,abuse,sex,drug,cocaine,dope,zoo,daddy,daughter,boy,girl,young,murder,cocks,year,old
interestingKeywords: t.me,feed,rss,xml,atom,dataleak,breach,blog,ransomware,source code,data breach
elasticsearch:
index: darkweb
port : 9200
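
For illustration, a minimal sketch of how the onion_validation pattern and the comma-separated blacklist above can be applied to a candidate URL (values are copied from this config, the accept() helper itself is hypothetical):

import re

# Values mirroring the config keys shown above (illustrative only).
onion_validation = r"([a-z2-7]{16,56}\.onion)"
blacklist = "pedo,xxx,infant,loli,porn,child,abuse,sex,drug"

onion_re = re.compile(onion_validation)
blacklist_re = re.compile("|".join(re.escape(w) for w in blacklist.split(",")), re.IGNORECASE)

def accept(candidate):
    """Return the matched onion domain, or None if invalid or blacklisted."""
    match = onion_re.search(candidate)
    if not match or blacklist_re.search(candidate):
        return None
    return match.group(1)

print(accept("http://expyuzz4wqqyqhjn.onion/about.html"))   # valid v2-style address
print(accept("http://drugsdrugsdrugsdrugs.onion"))          # rejected by the blacklist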
@@ -12,9 +15,13 @@ general:
sources:
# A few threat intel blogs to get you started!
- name: source-gist
module: gist
url: https://gist.github.com/search?l=Text&q=.onion
- name: simple-text-file
module: simplefile
filename: onion_master_list.txt
# - name: source-gist
# module: gist
# url: https://gist.github.com/search?l=Text&q=.onion
# - name: source-reddit
# module: reddit
@@ -43,20 +50,23 @@ sources:
operators:
- name: onionscan-go
module: onionscan
binpath: /home/tony/go/bin/onionscan
socks5:
http: 'socks5h://127.0.0.1:9050'
https: 'socks5h://127.0.0.1:9050'
TorController:
port: 9051
password: Xk5QP2haFMh8Y8D1060F1D7xaWEFG
timeout: 300
retries: 2
screenshots_path: null
blacklist: pedo,xxx,infant,loli,porn,child,abuse,sex,drug,cocaine,dope,zoo,daddy,daughter,boy,girl,young,murder
interestingKeywords: t.me,feed,rss,xml,atom,dataleak,breach,blog,ransomware,source code,data breach
- name: simple-html
module: html
socks5:
http: 'socks5h://127.0.0.1:9050'
https: 'socks5h://127.0.0.1:9050'
TorController:
port: 9051
password: your-torcontroller-password-here
- name: simple-screenshot
module: screenshot
screenshots_path: null
- name: onionscan-go
module: onionscan
binpath: /home/tony/go/bin/onionscan
# - name: yara-rule
# module: yara

@@ -1,131 +0,0 @@
import sys
import time
import traceback
import collections
from . import config
from . import dbhandler
from . import loghandler
class OnionManager:
"""ThreatIngestor main work logic.
Handles reading the config file, calling sources, maintaining state, and
sending artifacts to operators.
"""
def __init__(self, args):
# Load logger
log = loghandler.LoggerHandler(args.logLevel)
self.logger = log.start_logging()
# Load config
self.config = config.Config(args.configFile, self.logger)
# Load Elasticsearch.
try:
self.es = dbhandler.DbHandlerElasticSearch(
self.config.elasticsearch(),
self.logger)
except Exception as e:
# Error loading elasticsearch.
self.logger.error(e)
self.logger.debug(traceback.print_exc())
sys.exit(1)
# Instantiate plugins.
try:
self.logger.info("Initializing sources")
self.sources = {name: source(self.logger, **kwargs)
for name, source, kwargs in self.config.sources()}
self.logger.info("initializing operators")
self.operators = {name: operator(self.logger, **kwargs)
for name, operator, kwargs in self.config.operators()}
self.logger.info("initializing notifiers")
#self.notifiers = {name: operator(**kwargs)
# for name, operator, kwargs in self.config.notifiers()}
except Exception as e:
# Error loading elasticsearch.
self.logger.error(e)
self.logger.debug(traceback.print_exc())
sys.exit(1)
def run(self):
"""Run once, or forever, depending on config."""
if self.config.daemon():
self.logger.info("Running forever, in a loop")
self.run_forever()
else:
self.logger.info("Running once, to completion")
self.run_once()
def run_once(self):
"""Run each source once, passing artifacts to each operator."""
# Track some statistics about artifacts in a summary object.
summary = collections.Counter()
for source in self.sources:
# Run the source to collect artifacts.
self.logger.info(f"Running source '{source}'")
try:
onions = self.sources[source].run()
if onions:
self.logger.info(f'Found hidden links')
else:
self.logger.info('No links found')
except Exception as e:
self.logger.error(e)
self.logger.error(traceback.print_exc())
continue
# Process artifacts with each operator.
for operator in self.operators:
self.logger.info(f"Processing found onions with operator '{operator}'")
try:
doc = self.operators[operator].process(onions)
# Save the source state.
self.es.save(doc)
except Exception as e:
self.logger.error(e)
self.logger.error(traceback.print_exc())
continue
# # Record stats and update the summary.
# types = artifact_types(doc.get('interestingKeywords'))
# summary.update(types)
# for artifact_type in types:
# self.logger.info(f'types[artifact_type]')
# Log the summary.
self.logger.info(f"New artifacts: {dict(summary)}")
def run_forever(self):
"""Run forever, sleeping for the configured interval between each run."""
while True:
self.run_once()
self.logger.info(f"Sleeping for {self.config.sleep()} seconds")
time.sleep(self.config.sleep())
def artifact_types(artifact_list):
"""Return a dictionary with counts of each artifact type."""
types = {}
for artifact in artifact_list:
artifact_type = artifact.__class__.__name__.lower()
if artifact_type in types:
types[artifact_type] += 1
else:
types[artifact_type] = 1
return types
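
The counting loop above is equivalent to collections.Counter; a shorter sketch of the same helper (not part of the original file):

import collections

def artifact_types(artifact_list):
    """Count artifacts by class name, e.g. {'onion': 3, 'domain': 1}."""
    return dict(collections.Counter(
        artifact.__class__.__name__.lower() for artifact in artifact_list))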

@@ -1,50 +0,0 @@
"""OnionScraper
A Python3 application for indexing and scraping hidden services ElasticSearch
Installation:
This application assumes you have python3 and pip3 installed.
pip3 install -r requirements.txt
This software is provided subject to the MIT license stated below.
--------------------------------------------------
MIT License
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------
"""
import argparse
from onionscraper import OnionManager
# Load arguments from user
parser = argparse.ArgumentParser(
prog='onionscraper',
description=__doc__,formatter_class=argparse.RawDescriptionHelpFormatter)
parser.add_argument('-c', '--config',dest="configFile", required = True, help='Path to config file')
parser.add_argument("--log", dest="logLevel",default='INFO', choices=['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'], help="Set the logging level, default is INFO")
args = parser.parse_args()
app = OnionManager(args)
app.run()

@@ -1,170 +0,0 @@
import io
import importlib
import traceback
import yaml
from pathlib import Path
SOURCE = 'onionscraper.sources'
OPERATOR = 'onionscraper.operators'
INTERNAL_OPTIONS = [
'saved_state',
'module',
'credentials',
]
ARTIFACT_TYPES = 'artifact_types'
FILTER_STRING = 'filter'
ALLOWED_SOURCES = 'allowed_sources'
NAME = 'name'
class Config:
"""Config read/write operations, and convenience methods."""
def __init__(self, filename, logger):
"""Read a config file."""
self.logger = logger
self.filename = filename
with io.open(self.filename, 'r') as f:
try:
self.logger.info("Loading config file")
self.config = yaml.safe_load(f.read())
except yaml.error.YAMLError:
self.logger.error("YAML error in config")
@staticmethod
def _load_plugin(plugin_type, plugin):
"""Returns plugin class or raises an exception.
:raises: threatingestor.exceptions.PluginError
"""
try:
module = importlib.import_module('.'.join([plugin_type, plugin]))
return module.Plugin
except Exception as e:
print(e)
print(traceback.print_exc())
def daemon(self):
"""Returns boolean, are we daemonizing?"""
return self.config['general']['daemon']
def elasticsearch(self):
"""Returns elasticsaerch config"""
return self.config['general']['elasticsearch']
def sleep(self):
"""Returns number of seconds to sleep between iterations, if daemonizing."""
return self.config['general']['sleep']
# def onionscanner(self):
# """Returns onionscanner config dict"""
# screenshots = self.config['onionscanner'].pop('screenshots_path', None)
# if screenshots:
# self.config['onionscanner']['screenshots_path'] = Path(screenshots)
# else:
# self.config['onionscanner']['screenshots_path'] = Path(__file__).parents[1]/'screenshots'
# blacklist = self.config['onionscanner'].pop('blacklist', None)
# if blacklist:
# self.config['onionscanner']['blacklist'] = blacklist.split(',')
# interestingKeywords = self.config['onionscanner'].pop('interestingKeywords', None)
# if interestingKeywords:
# self.config['onionscanner']['interestingKeywords'] = interestingKeywords.split(',')
# return self.config['onionscanner']
def notifiers(self):
"""Returns notifiers config dictionary."""
return self.config.get('notifiers', {})
def logging(self):
"""Returns logging config dictionary."""
return self.config.get('logging', {})
def credentials(self, credential_name):
"""Return a dictionary with the specified credentials."""
for credential in self.config['credentials']:
for key, value in credential.items():
if key == NAME and value == credential_name:
return credential
return {}
def sources(self):
"""Return a list of (name, Source class, {kwargs}) tuples.
:raises: threatingestor.exceptions.PluginError
"""
sources = []
for source in self.config['sources']:
kwargs = {}
for key, value in source.items():
if key not in INTERNAL_OPTIONS:
kwargs[key] = value
elif key == 'credentials':
# Grab these named credentials
credential_name = value
for credential_key, credential_value in self.credentials(credential_name).items():
if credential_key != NAME:
kwargs[credential_key] = credential_value
# load and initialize the plugin
self.logger.info(f"Found source '{source[NAME]}'")
sources.append((source[NAME], self._load_plugin(SOURCE, source['module']), kwargs))
self.logger.info(f"Found {len(sources)} total sources")
return sources
def operators(self):
"""Return a list of (name, Operator class, {kwargs}) tuples.
:raises: threatingestor.exceptions.PluginError
"""
operators = []
for operator in self.config['operators']:
kwargs = {}
for key, value in operator.items():
if key not in INTERNAL_OPTIONS:
if key == ARTIFACT_TYPES:
# parse out special artifact_types option
artifact_types = []
for artifact in value:
try:
artifact_types.append(threatingestor.artifacts.STRING_MAP[artifact.lower().strip()])
except KeyError:
# ignore invalid artifact types
pass
kwargs[key] = artifact_types
elif key == FILTER_STRING:
# pass in special filter_string option
kwargs['filter_string'] = value
elif key == NAME:
# exclude name key from operator kwargs, since it's not used
pass
else:
kwargs[key] = value
elif key == 'credentials':
# Grab these named credentials
credential_name = value
for credential_key, credential_value in self.credentials(credential_name).items():
if credential_key != NAME:
kwargs[credential_key] = credential_value
# load and initialize the plugin
self.logger.info(f"Found operator '{operator[NAME]}'")
operators.append((operator[NAME], self._load_plugin(OPERATOR, operator['module']), kwargs))
self.logger.info(f"Found {len(operators)} total operators")
return operators
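
For context, the module: values in config.yml resolve through _load_plugin above: each maps to a module under onionscraper.sources or onionscraper.operators that must expose a Plugin class. A standalone sketch of that convention (function name here is illustrative):

import importlib

def load_plugin(plugin_type, plugin):
    """Import e.g. 'onionscraper.sources' + 'gist' and return its Plugin class."""
    module = importlib.import_module('.'.join([plugin_type, plugin]))
    return module.Plugin

# A source entry with "module: gist" therefore resolves to
# load_plugin('onionscraper.sources', 'gist') -> onionscraper.sources.gist.Plugin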

@@ -1,774 +0,0 @@
import sys
import traceback
from elasticsearch import Elasticsearch, helpers
class DbHandlerElasticSearch:
def __init__(self, config, logger):
self.logger = logger
self.logger.info('Creating Elasticsearch mapping')
self.config = config
self.mapping = '''
{
"mappings": {
"_doc": {
"properties": {
"html": {
"type": "text"
},
"onionscan": {
"type": "nested",
"properties": {
"bitcoinDetected": {
"type": "boolean"
},
"bitcoinServices": {
"properties": {
"bitcoin": {
"properties": {
"detected": {
"type": "boolean"
},
"prototocolVersion": {
"type": "long"
},
"userAgent": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
}
}
},
"bitcoin_test": {
"properties": {
"detected": {
"type": "boolean"
},
"prototocolVersion": {
"type": "long"
},
"userAgent": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
}
}
},
"dogecoin": {
"properties": {
"detected": {
"type": "boolean"
},
"prototocolVersion": {
"type": "long"
},
"userAgent": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
}
}
},
"litecoin": {
"properties": {
"detected": {
"type": "boolean"
},
"prototocolVersion": {
"type": "long"
},
"userAgent": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
}
}
}
}
},
"certificates": {
"type": "nested",
"properties": {
"AuthorityKeyId": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"BasicConstraintsValid": {
"type": "boolean"
},
"CRLDistributionPoints": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"DNSNames": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"ExtKeyUsage": {
"type": "long"
},
"Extensions": {
"properties": {
"Critical": {
"type": "boolean"
},
"Id": {
"type": "long"
},
"Value": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
}
}
},
"IsCA": {
"type": "boolean"
},
"Issuer": {
"properties": {
"CommonName": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"Country": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"Locality": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"Names": {
"properties": {
"Type": {
"type": "long"
},
"Value": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
}
}
},
"Organization": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"OrganizationalUnit": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"Province": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"SerialNumber": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
}
}
},
"IssuingCertificateURL": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"KeyUsage": {
"type": "long"
},
"MaxPathLen": {
"type": "long"
},
"MaxPathLenZero": {
"type": "boolean"
},
"NotAfter": {
"type": "date"
},
"NotBefore": {
"type": "date"
},
"OCSPServer": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"PermittedDNSDomainsCritical": {
"type": "boolean"
},
"PolicyIdentifiers": {
"type": "long"
},
"PublicKey": {
"properties": {
"E": {
"type": "text"
},
"N": {
"type": "text"
}
}
},
"PublicKeyAlgorithm": {
"type": "long"
},
"Raw": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"RawIssuer": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"RawSubject": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"RawSubjectPublicKeyInfo": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"RawTBSCertificate": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"SerialNumber": {
"type": "text"
},
"Signature": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"SignatureAlgorithm": {
"type": "long"
},
"Subject": {
"properties": {
"CommonName": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"Country": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"Locality": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"Names": {
"properties": {
"Type": {
"type": "long"
},
"Value": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
}
}
},
"Organization": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"OrganizationalUnit": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"Province": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"SerialNumber": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
}
}
},
"SubjectKeyId": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"Version": {
"type": "long"
}
}
},
"crawls": {
"type": "nested",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"dateScanned": {
"type": "date"
},
"f_name": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"ftpBanner": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"ftpDetected": {
"type": "boolean"
},
"ftpFingerprint": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"hiddenService": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"identifierReport": {
"properties": {
"analyticsIDs": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"bitcoinAddresses": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"emailAddresses": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"exifImages": {
"properties": {
"exifTags": {
"properties": {
"name": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"value": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
}
}
},
"location": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
}
}
},
"foundApacheModStatus": {
"type": "boolean"
},
"linkedOnions": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"openDirectories": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"privateKeyDetected": {
"type": "boolean"
},
"serverVersion": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
}
}
},
"ircDetected": {
"type": "boolean"
},
"lastAction": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"mongodbDetected": {
"type": "boolean"
},
"online": {
"type": "boolean"
},
"performedScans": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"pgpKeys": {
"properties": {
"armoredKey": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"fingerprint": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"identity": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
}
}
},
"ricochetDetected": {
"type": "boolean"
},
"skynetDetected": {
"type": "boolean"
},
"smtpBanner": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"smtpDetected": {
"type": "boolean"
},
"smtpFingerprint": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"sshBanner": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"sshDetected": {
"type": "boolean"
},
"sshKey": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"timedOut": {
"type": "boolean"
},
"tlsDetected": {
"type": "boolean"
},
"vncDetected": {
"type": "boolean"
},
"webDetected": {
"type": "boolean"
},
"xmppDetected": {
"type": "boolean"
}
}
},
"screenshots": {
"type": "nested",
"properties": {
"dateScreenshoted": {
"type": "date"
},
"filename": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
}
}
}
}
}
}
}
'''
try:
# keep the index name around; count() and save() below rely on self.index
self.index = self.config['index']
self.es = Elasticsearch([{
'host': self.config['host'],
'port': self.config['port']}])
self.es.indices.create(
index=self.index,
body=self.mapping,
ignore=400)
except Exception as e:
self.logger.error(e)
self.logger.error(traceback.format_exc())
sys.exit(1)
def count(self):
self.es.indices.refresh(self.index)
status = self.es.count(index=self.index)
if status['_shards']['successful'] == 1:
self.logger.info('Successful')
self.logger.info('Count:%d',status['count'])
else:
self.logger.error(status)
def save(self, doc):
self.es.index(index=self.index,body=doc)
self.count()
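
Once save() has indexed documents, they can be read back with the same elasticsearch-py client; a hedged example query against the darkweb index from the config (host and port are assumptions):

from elasticsearch import Elasticsearch

es = Elasticsearch([{'host': 'localhost', 'port': 9200}])  # assumed host/port

# Full-text search over the crawled HTML for an interesting keyword.
resp = es.search(
    index='darkweb',
    body={'query': {'match': {'html': 'ransomware'}}},
    size=10)

for hit in resp['hits']['hits']:
    print(hit['_source'].get('hiddenService'), hit['_score'])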

@@ -1,33 +0,0 @@
import os
import logging
from pathlib import Path
class LoggerHandler():
def __init__(self, level):
self.level = getattr(logging, level)
self.logger = logging.getLogger("OnionScraper")
self.logger.setLevel(self.level)
# create console handler and set level to debug
ch = logging.StreamHandler()
ch.setLevel(self.level)
# create file logging
logFile = Path(__file__).parents[1]
logging_path = os.path.join(logFile, "info.log")
fh = logging.FileHandler(logging_path)
# create formatter
formatter = logging.Formatter('[%(asctime)s] - %(name)s - %(levelname)s - %(message)s',datefmt='%a, %d %b %Y %H:%M:%S')
formatter_console = logging.Formatter('[%(asctime)s] - %(levelname)s - %(message)s',datefmt='%d %b %Y %H:%M:%S')
# add formatter to ch
ch.setFormatter(formatter_console)
fh.setFormatter(formatter)
# add ch to logger
self.logger.addHandler(ch) #added logging into console
self.logger.addHandler(fh) #added logging into file
def start_logging(self):
self.logger.info('Starting OnionScraper')
return self.logger

@@ -1,78 +0,0 @@
import re
class Operator:
"""Base class for all Operator plugins.
Note: This is an abstract class. You must extend ``__init__`` and call
``super`` to ensure this class's constructor is called. You must override
``handle_artifact`` with the same signature. You may define additional
``handle_{artifact_type}`` methods as needed (see the threatkb operator for
an example) - these methods are purely convention, and are not required.
When adding additional methods to child classes, consider prefixing the
method name with an underscore to denote a ``_private_method``. Do not
override other existing methods from this class.
"""
def __init__(self, artifact_types=None, filter_string=None, allowed_sources=None):
"""Override this constructor in child classes.
The arguments above (artifact_types, filter_string, allowed_sources)
should be accepted explicitly as above, in all child classes.
Additional arguments should be added: url, auth, etc, whatever is
needed to set up the object.
Each operator should default self.artifact_types to a list of Artifacts
supported by the plugin, and allow passing in artifact_types to
overwrite that default.
Example:
>>> self.artifact_types = artifact_types or [
... artifacts.IPAddress,
... artifacts.Domain,
... ]
It's recommended to call this __init__ method via super from all child
classes. Remember to do so *before* setting any default artifact_types.
"""
self.artifact_types = artifact_types or []
self.filter_string = filter_string or ''
self.allowed_sources = allowed_sources or []
def handle_onion(self, url):
"""Override with the same signature.
:param artifact: A single ``Artifact`` object.
:returns: None (always ignored)
"""
raise NotImplementedError()
def _artifact_is_allowed(self, artifact):
"""Returns True if this is allowed by this plugin's filters."""
# # Must be in allowed_types.
# if not any(isinstance(artifact, t) for t in self.artifact_types):
# return False
#
# # Must match the filter string.
# if not artifact.match(self.filter_string):
# return False
#
# # Must be in allowed_sources, if set.
# if self.allowed_sources and not any(
# [re.compile(p).search(artifact.source_name)
# for p in self.allowed_sources]):
# return False
#
return True
def process(self, onions):
"""Process all applicable onions."""
for onion in onions:
if self._artifact_is_allowed(onion.url):
self.handle_onion(onion)
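
In practice a child operator only needs to call this constructor and implement handle_onion; a minimal hypothetical plugin following the convention described above (not one of the shipped operators):

from onionscraper.operators import Operator

class Plugin(Operator):
    """Toy operator that only logs each onion it is given (illustrative)."""

    def __init__(self, logger, **kwargs):
        self.logger = logger
        # Call the base constructor before relying on its defaults.
        super().__init__(artifact_types=kwargs.get('artifact_types'),
                         filter_string=kwargs.get('filter_string'),
                         allowed_sources=kwargs.get('allowed_sources'))

    def handle_onion(self, onion):
        # onion is the namedtuple produced by a Source: (url, source, type).
        self.logger.info(f"Would process {onion.url} (from {onion.source})")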

@@ -1,259 +0,0 @@
import re
import os
import sys
import json
import time
import random
import traceback
import subprocess
from uuid import uuid4
from pathlib import Path
from datetime import datetime as dt
from json.decoder import JSONDecodeError
from concurrent.futures import ProcessPoolExecutor
from threading import Timer
import requests
from stem.control import Controller
from stem import Signal
from selenium import webdriver
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
from onionscraper.operators import Operator
class Plugin(Operator):
"""OnionScraper main work logic.
Handles reading the config file, calling sources, maintaining state and
sending artifacts to operators.
"""
def __init__(self, logger, **kwargs):
self.logger = logger
self.logger.info('Initializing OnionScanner')
screenshots = kwargs.pop('screenshots_path', None)
if screenshots:
self.screenshots = Path(screenshots)
else:
self.screenshots = Path(__file__).parents[1]/'screenshots'
self.onionscan = kwargs['binpath']
self.timeout = int(kwargs['timeout'])
self.proxy = kwargs['socks5']
self.torControl = kwargs['TorController']
self.retries = int(kwargs['retries'])
self.headers ={
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:72.0) Gecko/20100101 Firefox/72.0',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
'Accept-Language':'en-US,en;q=0.5',
'DNT': '1', 'Connection':
'keep-alive',
'Upgrade-Insecure-Requests': '1'}
blacklist = kwargs['blacklist'].split(',')
self.blacklist = re.compile('|'.join([re.escape(word) for word in blacklist]), re.IGNORECASE)
keywords = kwargs['interestingKeywords'].split(',')
self.keywords = re.compile('|'.join([re.escape(word) for word in keywords]), re.IGNORECASE)
self.session = self.get_tor_session()
def response(self, status, content, onion):
"""
status: success/failure
content: dict
onion: str
return: dict
"""
return {'status': status, 'data': content, 'onion': onion}
def parseDoc(self, data):
data['onionscan'].pop('simpleReport', None)
crawls = data['onionscan'].pop('crawls', None)
hiddenService = data['onionscan'].pop('hiddenService', None)
data['onionscan']['crawls'] = [*crawls]
data['hiddenService'] = hiddenService
for onion in crawls.keys():
print(onion)
#q.enqueue(self.crawl, onion)
#with open('test.json', 'w', encoding='utf-8') as f:
# json.dump(data, f, ensure_ascii=False, indent=4)
return data
def format_directory(self, directory):
d = dt.now()
year = str(d.year)
month = str(d.month)
# prefix month and day with "0" if it is only one digit
if len(month) < 2:
month = "0" + month
day = str(d.day)
if len(day) < 2:
day = "0" + day
save_path = directory/year/month/day
if not os.path.isdir(save_path):
self.logger.info("[*] Creating directory to save screenshots")
os.makedirs(save_path)
return save_path
def take_screenshot(self, save_path, onion):
binary = FirefoxBinary('/home/tony/Projects/OnionScraper/geckodriver')
fp = webdriver.FirefoxProfile()
fp.set_preference('network.proxy.type', 1)
fp.set_preference('network.proxy.socks', '127.0.0.1')
fp.set_preference('network.proxy.socks_port', 9050)
fp.set_preference('network.proxy.socks_remote_dns', True)
options = Options()
options.headless = True
driver = webdriver.Firefox(
executable_path='/home/tony/Projects/OnionScraper/geckodriver',
options=options,
firefox_profile=fp)
url = 'http://' + onion
driver.get(url)
uid = str(uuid4()).split('-')[0]
filename = f"{onion}_screenshot_{uid}.png"
f_name = f"{save_path}/{filename}"
driver.save_screenshot(f_name)
driver.quit()
if os.path.isfile(f_name):
self.logger.info(f'[*] Screenshot was taken. {f_name}')
dateScreenshoted = dt.utcnow().strftime('%Y-%m-%dT%H:%M:%S.%f')+ 'Z'
result = {'dateScreenshoted':dateScreenshoted,'filename':filename}
return self.response("success",result,onion)
else:
self.logger.error('[x] Unable to take screenshot')
return self.response("failure",None,onion)
def get_tor_session(self):
try:
s = requests.session()
s.proxies = self.proxy
s.headers.update(self.headers)
except Exception as e:
self.logger.error(e)
self.logger.debug(traceback.print_exc())
return s
# signal TOR for a new connection
def renew_connection(self):
with Controller.from_port(port = self.torControl['port']) as controller:
# Now we switch TOR identities to make sure we have a good connection
self.logger.info('Getting new Tor IP')
# authenticate to our local TOR controller
controller.authenticate(self.torControl['password'])
# send the signal for a new identity
controller.signal(Signal.NEWNYM)
# wait for the new identity to be initialized
time.sleep(controller.get_newnym_wait())
session = self.get_tor_session()
self.logger.info(f"IP is {session.get('http://httpbin.org/ip').json()['origin']}")
def handle_timeout(self, process, onion):
#
# Handle a timeout from the onionscan process.
#
try:
# kill the onionscan process
process.kill()
self.logger.info("[!!!] Killed the onionscan process.")
except:
pass
self.renew_connection()
return
def run_sessions(self, onion):
retry = 0
result = None
while True:
try:
url = 'http://'+onion
self.logger.info(url)
content = self.session.get(url)
if content.status_code == 200:
result = content.json()
except JSONDecodeError as e:
self.logger.debug(f'JSONDecodeError {e}')
result = content.text
except Exception as e:
self.logger.error(e)
self.logger.debug(traceback.print_exc())
finally:
if result:
return self.response("success",result,onion)
else:
self.logger.info('[x] No results found retrying ...')
retry += 1
self.renew_connection()
if retry > self.retries:
self.logger.error('[x] Max retries exceeded')
return self.response("failure",None, onion)
def run_onionscan(self, onion):
self.logger.info("[*] Running onionscan on %s", onion)
# fire up onionscan
process = subprocess.Popen([self.onionscan,"--webport=0","--jsonReport","--simpleReport=false",onion],stdout=subprocess.PIPE,stderr=subprocess.PIPE)
# start the timer and let it run till timeout minutes
process_timer = Timer(300,self.handle_timeout,args=[process,onion])
process_timer.start()
# wait for the onion scan results
stdout = process.communicate()[0]
# we have received valid results so we can kill the timer
if process_timer.is_alive():
process_timer.cancel()
return self.response("success",json.loads(stdout),onion)
self.logger.info("[!!!] Process timed out for %s", onion)
return self.response("failure",None, onion)
def handle_onion(self, onion_tuple):
onion = onion_tuple.url
self.logger.info(f'Processing {onion} with onionscan')
try:
blacklist_URL = self.blacklist.search(onion)
if blacklist_URL:
self.logger.info(f"[X] Blocked by blacklist => matched keyword {blacklist_URL.group()}")
else:
self.logger.debug("[*] URL blacklist test: PASSED")
results = self.run_onionscan(onion)
if results['status'] == 'success' and results['data'].get('webDetected'):  # webDetected is a JSON boolean, not the string 'true'
content = self.run_sessions(onion)
if content['status'] == 'success':
blacklist_CONTENT = self.blacklist.search(content['data'])
if blacklist_CONTENT:
self.logger.info(f"[X] Blocked by blacklist content => matched keyword {blacklist_CONTENT.group()}")
else:
self.logger.debug("[*] CONTENT blacklist test: PASSED")
screenshot = self.take_screenshot(self.format_directory(self.screenshots), onion)
self.logger.info("Indexing!")
doc = {
'onionscan': results['data'],  # already parsed from JSON in run_onionscan
'html':content['data'],
'screenshots':screenshot['data'],
'interestingKeywords': self.keywords.findall(content['data'])  # compiled in __init__ as self.keywords
}
return self.parseDoc(doc)
else:
self.logger.info(f"[x] hidden service {onion} is not active")
except Exception as e:
self.logger.error(e)
self.logger.error(traceback.print_exc())
finally:
pass
#sys.exit(0)
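
The Timer/handle_timeout pattern in run_onionscan above can also be expressed with subprocess.run and its timeout argument; a compact alternative sketch (binary path and flags mirror the code above, the helper itself is hypothetical):

import json
import subprocess

def run_onionscan_once(binpath, onion, timeout=300):
    """Run onionscan with a hard timeout; return the parsed JSON report or None."""
    try:
        proc = subprocess.run(
            [binpath, "--webport=0", "--jsonReport", "--simpleReport=false", onion],
            capture_output=True, timeout=timeout)
    except subprocess.TimeoutExpired:
        return None  # caller can renew the Tor circuit and retry
    return json.loads(proc.stdout) if proc.stdout else None

# e.g. report = run_onionscan_once("/home/tony/go/bin/onionscan", "example.onion")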

@@ -1,15 +0,0 @@
from onionscraper.operators import Operator
class Plugin(Operator):
"""Operator for output to flat CSV file."""
def __init__(self, filename, base_score):
"""CSV operator."""
self.filename = filename
#super(Plugin, self).__init__(artifact_types, filter_string, allowed_sources)
def handle_artifact(self, artifact):
"""Operate on a single artifact."""
pass

@@ -1,41 +0,0 @@
from collections import namedtuple
class Source(object):
"""Base class for all Source plugins.
Note: This is an abstract class. You must override ``__init__`` and ``run``
in child classes. You should not override ``process_element``. When adding
additional methods to child classes, consider prefixing the method name
with an underscore to denote a ``_private_method``.
"""
def __init__(self, name, *args, **kwargs):
"""Override this constructor in child classes.
The first argument must always be ``name``.
Other arguments should be url, auth, etc., whatever is needed to set
up the object.
"""
self.onion = namedtuple('onion', ['url','source','type'])
def run(self):
"""Run and return ``(saved_state, list(Artifact))``.
Override this method in child classes.
The method signature and return values must remain consistent.
The method should attempt to pick up where we left off using
``saved_state``, if supported. If ``saved_state`` is ``None``, you can
assume this is a first run. If state is maintained by the remote
resource (e.g. as it is with SQS), ``saved_state`` should always be
``None``.
"""
raise NotImplementedError()
def process_element(self, content, reference_link, include_nonobfuscated=False):
"""Take a single source content/url and return a list of Artifacts.
This is the main work block of Source plugins, which handles
IOC extraction and artifact creation.
:param content: String content to extract from.
:param reference_link: Reference link to attach to all artifacts.
:param include_nonobfuscated: Include non-defanged URLs in output?
"""
self.logger.debug(f"Processing in source '{self.name}'")
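
A child source, then, sets up its own state, calls this constructor, and returns a list of onion namedtuples from run(); a minimal hypothetical file-based source in that style (similar in spirit to the simplefile source referenced in config.yml):

from onionscraper.sources import Source

class Plugin(Source):
    """Toy source that yields onions from a newline-delimited text file (illustrative)."""

    def __init__(self, logger, name, filename):
        self.logger = logger
        self.name = name
        self.filename = filename
        super().__init__(name)

    def run(self):
        with open(self.filename) as f:
            return [self.onion(url=line.strip(), source=self.name, type='domain')
                    for line in f if line.strip()]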

@@ -1,153 +0,0 @@
#!/usr/bin/python3
# -*- coding: utf-8 -*-
__author__ = 'Andrey Glauzer'
__license__ = "MIT"
__version__ = "1.0.1"
__maintainer__ = "Andrey Glauzer"
__status__ = "Development"
import requests
import json
import re
import urllib.parse
from random import choice
import time
from bs4 import BeautifulSoup
from onionscraper.sources import Source
class Plugin(Source):
def __init__(self, logger, name, url):
self.logger = logger
self.name = name
self.url = url
self.desktop_agents = [
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:60.0) Gecko/20100101 Firefox/60.0']
super().__init__(name)
def run(self):
self.logger.info('Starting Gist Scraper')
self.cookies()
self.pagination()
self.scraping()
return self.raw()
@property
def random_headers(self):
return {
'User-Agent': choice(self.desktop_agents),
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
}
def cookies(self):
self.logger.info('Setting GIST cookies')
with requests.Session() as self.session:
self.headers = self.random_headers
request = self.session.get(self.url, headers=self.headers)
if request.status_code == 200:
pass
else:
self.logger.error('No Response from GIST')
def pagination(self):
request = self.session.get(
f"https://gist.github.com/search?l=Text&q={urllib.parse.quote('.onio')}", headers=self.headers)
self.soup = BeautifulSoup(request.content, features="lxml")
pages = []
self.urls = [self.url]
try:
for pagination in self.soup.find('div', {'class': 'pagination'}).findAll('a'):
pages.append(pagination.get_text())
except:
pages = False
if pages:
cont = 2
while cont <= 1: # int(pages[-2]):
cont += 1
full_url = f"https://gist.github.com/search?l=Text&p={cont-1}&q={urllib.parse.quote('.onio')}"
self.urls.append(full_url)
def scraping(self):
url = []
for inurl in self.urls:
self.logger.info(f"Connecting to {inurl}")
time.sleep(5)
request = self.session.get(inurl, headers=self.headers)
if request.status_code == 200:
soup = BeautifulSoup(request.content, features="lxml")
for code in soup.findAll('div', {'class': 'gist-snippet'}):
if '.onion' in code.get_text().lower():
for raw in code.findAll('a', {'class': 'link-overlay'}):
try:
url.append(raw['href'])
except:
pass
self.urls_raw = []
for get in url:
self.logger.info(f"Connecting to {get}")
time.sleep(5)
try:
request = self.session.get(get, headers=self.headers)
if request.status_code == 200:
soup = BeautifulSoup(request.content, features="lxml")
for raw in soup.findAll('a', {'class': 'btn btn-sm'}):
try:
gist_url = f"https://gist.githubusercontent.com{raw['href']}"
self.urls_raw.append(gist_url)
except:
pass
except(requests.exceptions.ConnectionError,
requests.exceptions.ChunkedEncodingError,
requests.exceptions.ReadTimeout,
requests.exceptions.InvalidURL) as e:
self.logger.error(
f"I was unable to connect to the url, because an error occurred.\n{e}")
pass
def raw(self):
self.logger.info('Performing replaces and regex. WAIT...')
itens = []
onions = []
for raw in self.urls_raw:
if '.txt' in raw.lower() \
or '.csv' in raw.lower():
time.sleep(5)
request = self.session.get(raw, headers=self.headers)
self.soup = BeautifulSoup(request.content, features="lxml")
for pre in self.soup.findAll('body'):
list = pre.get_text().split('\n')
itens.extend(list)
regex = re.compile(
"[A-Za-z0-9]{0,12}\.?[A-Za-z0-9]{12,50}\.onion")
for lines in itens:
rurls = lines \
.replace('\xad', '') \
.replace('\n', '') \
.replace("http://", '') \
.replace("https://", '') \
.replace("www.", "")
url = regex.match(rurls)
if url is not None:
onions.append(self.onion(url=url.group(), source='gist', type='domain'))
return onions

@@ -5,8 +5,6 @@ click==7.1.2
elasticsearch==7.8.0
idna==2.10
lxml==4.5.1
# Editable Git install with no remote (OnionScraper==1.0.0)
-e /home/tony/Projects/OnionScraper
PySocks==1.7.1
PyYAML==5.3.1
requests==2.24.0

@@ -8,14 +8,14 @@ def readme_file_contents():
setup(
name='OnionScraper',
name='OnionIngestor',
version='1.0.0',
description='Python app to scrape and index hidden websites',
long_description=readme_file_contents(),
author='dan',
author_email='test@google.com',
license='MIT',
packages=['onionscraper'],
packages=['onioningestor'],
zip_safe=False,
install_requires=[]
)
