Merge pull request #1 from 2O4/master

formating, and fix indentation on function! Merged
pull/6/head
danieleperera 4 years ago committed by GitHub
commit ca297ea465
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

4
.gitignore vendored

@ -3,10 +3,12 @@ webui
templates templates
OnionIngestor.egg-info OnionIngestor.egg-info
screenshots screenshots
dump.rdb dump.rdb
onionscandb onionscandb
config.yml config.yml
*.log *.log
*.pyc *.pyc
__pycache__ __pycache__
venv venv
.vscode
build

@ -6,42 +6,41 @@ import yaml
from pathlib import Path from pathlib import Path
SOURCE = 'onioningestor.sources'
OPERATOR = 'onioningestor.operators'
SOURCE = "onioningestor.sources"
OPERATOR = "onioningestor.operators"
INTERNAL_OPTIONS = [ INTERNAL_OPTIONS = [
'saved_state', "saved_state",
'module', "module",
'credentials', "credentials",
] ]
ARTIFACT_TYPES = "artifact_types"
ARTIFACT_TYPES = 'artifact_types' FILTER_STRING = "filter"
FILTER_STRING = 'filter' ALLOWED_SOURCES = "allowed_sources"
ALLOWED_SOURCES = 'allowed_sources' NAME = "name"
NAME = 'name'
class Config: class Config:
"""Config read/write operations, and convenience methods.""" """Config read/write operations, and convenience methods."""
def __init__(self, filename, logger): def __init__(self, filename, logger):
"""Read a config file.""" """Read a config file."""
self.logger = logger self.logger = logger
self.filename = filename self.filename = filename
with io.open(self.filename, 'r') as f: with io.open(self.filename, "r") as f:
try: try:
self.logger.info("Loading config file") self.logger.info("Loading config file")
self.config = yaml.safe_load(f.read()) self.config = yaml.safe_load(f.read())
except yaml.error.YAMLError: except yaml.error.YAMLError:
self.logger.error("YAML error in config") self.logger.error("YAML error in config")
@staticmethod @staticmethod
def _load_plugin(plugin_type, plugin): def _load_plugin(plugin_type, plugin):
"""Returns plugin class or raises an exception. """Returns plugin class or raises an exception.
:raises: threatingestor.exceptions.PluginError :raises: threatingestor.exceptions.PluginError
""" """
try: try:
module = importlib.import_module('.'.join([plugin_type, plugin])) module = importlib.import_module(".".join([plugin_type, plugin]))
return module.Plugin return module.Plugin
except Exception as e: except Exception as e:
print(e) print(e)
@ -49,89 +48,86 @@ class Config:
def daemon(self): def daemon(self):
"""Returns boolean, are we daemonizing?""" """Returns boolean, are we daemonizing?"""
return self.config['general']['daemon'] return self.config["general"]["daemon"]
def elasticsearch(self): def elasticsearch(self):
"""Returns elasticsaerch config""" """Returns elasticsaerch config"""
return self.config['general']['elasticsearch'] return self.config["general"]["elasticsearch"]
def sleep(self): def sleep(self):
"""Returns number of seconds to sleep between iterations, if daemonizing.""" """Returns number of seconds to sleep between iterations, if daemonizing."""
return self.config['general']['sleep'] return self.config["general"]["sleep"]
def blacklist(self): def blacklist(self):
return self.config['general']['blacklist'].split(',') return self.config["general"]["blacklist"].split(",")
# def onionscanner(self): # def onionscanner(self):
# """Returns onionscanner config dict""" # """Returns onionscanner config dict"""
# screenshots = self.config['onionscanner'].pop('screenshots_path', None) # screenshots = self.config['onionscanner'].pop('screenshots_path', None)
# if screenshots: # if screenshots:
# self.config['onionscanner']['screenshots_path'] = Path(screenshots) # self.config['onionscanner']['screenshots_path'] = Path(screenshots)
# else: # else:
# self.config['onionscanner']['screenshots_path'] = Path(__file__).parents[1]/'screenshots' # self.config['onionscanner']['screenshots_path'] = Path(__file__).parents[1]/'screenshots'
# blacklist = self.config['onionscanner'].pop('blacklist', None) # blacklist = self.config['onionscanner'].pop('blacklist', None)
# if blacklist: # if blacklist:
# self.config['onionscanner']['blacklist'] = blacklist.split(',') # self.config['onionscanner']['blacklist'] = blacklist.split(',')
# interestingKeywords = self.config['onionscanner'].pop('interestingKeywords', None) # interestingKeywords = self.config['onionscanner'].pop('interestingKeywords', None)
# if interestingKeywords: # if interestingKeywords:
# self.config['onionscanner']['interestingKeywords'] = blacklist.split(',') # self.config['onionscanner']['interestingKeywords'] = blacklist.split(',')
# return self.config['onionscanner'] # return self.config['onionscanner']
def notifiers(self): def notifiers(self):
"""Returns notifiers config dictionary.""" """Returns notifiers config dictionary."""
return self.config.get('notifiers', {}) return self.config.get("notifiers", {})
def logging(self): def logging(self):
"""Returns logging config dictionary.""" """Returns logging config dictionary."""
return self.config.get('logging', {}) return self.config.get("logging", {})
def credentials(self, credential_name): def credentials(self, credential_name):
"""Return a dictionary with the specified credentials.""" """Return a dictionary with the specified credentials."""
for credential in self.config['credentials']: for credential in self.config["credentials"]:
for key, value in credential.items(): for key, value in credential.items():
if key == NAME and value == credential_name: if key == NAME and value == credential_name:
return credential return credential
return {} return {}
def sources(self): def sources(self):
"""Return a list of (name, Source class, {kwargs}) tuples. """Return a list of (name, Source class, {kwargs}) tuples.
:raises: threatingestor.exceptions.PluginError :raises: threatingestor.exceptions.PluginError
""" """
sources = [] sources = []
for source in self.config['sources']: for source in self.config["sources"]:
kwargs = {} kwargs = {}
for key, value in source.items(): for key, value in source.items():
if key not in INTERNAL_OPTIONS: if key not in INTERNAL_OPTIONS:
kwargs[key] = value kwargs[key] = value
elif key == 'credentials': elif key == "credentials":
# Grab these named credentials # Grab these named credentials
credential_name = value credential_name = value
for credential_key, credential_value in self.credentials(credential_name).items(): for credential_key, credential_value in self.credentials(
credential_name
).items():
if credential_key != NAME: if credential_key != NAME:
kwargs[credential_key] = credential_value kwargs[credential_key] = credential_value
# load and initialize the plugin # load and initialize the plugin
self.logger.info(f"Found source '{source[NAME]}'") self.logger.info(f"Found source '{source[NAME]}'")
sources.append((source[NAME], self._load_plugin(SOURCE, source['module']), kwargs)) sources.append(
(source[NAME], self._load_plugin(SOURCE, source["module"]), kwargs)
)
self.logger.info(f"Found {len(sources)} total sources") self.logger.info(f"Found {len(sources)} total sources")
return sources return sources
def operators(self): def operators(self):
"""Return a list of (name, Operator class, {kwargs}) tuples. """Return a list of (name, Operator class, {kwargs}) tuples.
:raises: threatingestor.exceptions.PluginError :raises: threatingestor.exceptions.PluginError
""" """
operators = [] operators = []
for operator in self.config['operators']: for operator in self.config["operators"]:
kwargs = {} kwargs = {}
for key, value in operator.items(): for key, value in operator.items():
if key not in INTERNAL_OPTIONS: if key not in INTERNAL_OPTIONS:
@ -140,7 +136,11 @@ class Config:
artifact_types = [] artifact_types = []
for artifact in value: for artifact in value:
try: try:
artifact_types.append(threatingestor.artifacts.STRING_MAP[artifact.lower().strip()]) artifact_types.append(
threatingestor.artifacts.STRING_MAP[
artifact.lower().strip()
]
)
except KeyError: except KeyError:
# ignore invalid artifact types # ignore invalid artifact types
pass pass
@ -148,7 +148,7 @@ class Config:
elif key == FILTER_STRING: elif key == FILTER_STRING:
# pass in special filter_string option # pass in special filter_string option
kwargs['filter_string'] = value kwargs["filter_string"] = value
elif key == NAME: elif key == NAME:
# exclude name key from operator kwargs, since it's not used # exclude name key from operator kwargs, since it's not used
@ -157,16 +157,24 @@ class Config:
else: else:
kwargs[key] = value kwargs[key] = value
elif key == 'credentials': elif key == "credentials":
# Grab these named credentials # Grab these named credentials
credential_name = value credential_name = value
for credential_key, credential_value in self.credentials(credential_name).items(): for credential_key, credential_value in self.credentials(
credential_name
).items():
if credential_key != NAME: if credential_key != NAME:
kwargs[credential_key] = credential_value kwargs[credential_key] = credential_value
# load and initialize the plugin # load and initialize the plugin
self.logger.info(f"Found operator '{operator[NAME]}'") self.logger.info(f"Found operator '{operator[NAME]}'")
operators.append((operator[NAME], self._load_plugin(OPERATOR, operator['module']), kwargs)) operators.append(
(
operator[NAME],
self._load_plugin(OPERATOR, operator["module"]),
kwargs,
)
)
self.logger.info(f"Found {len(operators)} total operators") self.logger.info(f"Found {len(operators)} total operators")
return operators return operators

@ -8,7 +8,7 @@ class DbHandlerElasticSearch:
self.logger = logger self.logger = logger
self.logger.info('Creating Elasticsearch mapping') self.logger.info('Creating Elasticsearch mapping')
self.config = config self.config = config
self.mapping = ''' self.mapping = """
{ {
"mappings": { "mappings": {
"_doc": { "_doc": {
@ -18,7 +18,7 @@ class DbHandlerElasticSearch:
}, },
"blacklist": { "blacklist": {
"type": "keyword" "type": "keyword"
}, },
"monitor": { "monitor": {
"type": "boolean", "type": "boolean",
"null_value": "false" "null_value": "false"
@ -50,7 +50,7 @@ class DbHandlerElasticSearch:
} }
} }
} }
''' """
self.index = self.config['index'] self.index = self.config['index']
try: try:
self.es = Elasticsearch([{ self.es = Elasticsearch([{
@ -87,4 +87,3 @@ class DbHandlerElasticSearch:
status = self.es.index(index=self.index,body=data) status = self.es.index(index=self.index,body=data)
self.count() self.count()
return status return status

@ -2,6 +2,7 @@ import os
import logging import logging
from pathlib import Path from pathlib import Path
class LoggerHandler(): class LoggerHandler():
def __init__(self, level): def __init__(self, level):
self.level = getattr(logging, level) self.level = getattr(logging, level)
@ -30,4 +31,3 @@ class LoggerHandler():
def start_logging(self): def start_logging(self):
self.logger.info('Starting OnionScraper') self.logger.info('Starting OnionScraper')
return self.logger return self.logger

@ -19,6 +19,7 @@ class Operator:
method name with an underscore to denote a ``_private_method``. Do not method name with an underscore to denote a ``_private_method``. Do not
override other existing methods from this class. override other existing methods from this class.
""" """
def __init__(self, logger, elasticsearch, allowed_sources=None): def __init__(self, logger, elasticsearch, allowed_sources=None):
"""Override this constructor in child classes. """Override this constructor in child classes.
@ -87,7 +88,7 @@ class Operator:
def collect(self, onions): def collect(self, onions):
for onion in onions: for onion in onions:
self.logger.info(f'thread function processing {onion}') self.logger.info(f'thread function processing {onion}')
# Add link to database # Add link to database
db = self.es.save({ db = self.es.save({
'hiddenService':onion.url, 'hiddenService':onion.url,
'monitor':'false', 'monitor':'false',

@ -24,24 +24,27 @@ class Plugin(Operator):
def __init__(self, logger, elasticsearch, allowed_sources, **kwargs): def __init__(self, logger, elasticsearch, allowed_sources, **kwargs):
super(Plugin, self).__init__(logger, elasticsearch, allowed_sources) super(Plugin, self).__init__(logger, elasticsearch, allowed_sources)
self.plugin_name = 'simple-html' self.plugin_name = "simple-html"
self.logger.info(f"Initializing {self.plugin_name}") self.logger.info(f"Initializing {self.plugin_name}")
self.timeout = int(kwargs['timeout']) self.timeout = int(kwargs["timeout"])
self.retries = int(kwargs['retries']) self.retries = int(kwargs["retries"])
interesting = kwargs['interestingKeywords'].split(',') interesting = kwargs["interestingKeywords"].split(",")
self.interesting = re.compile('|'.join([re.escape(word) for word in interesting]), re.IGNORECASE) self.interesting = re.compile(
"|".join([re.escape(word) for word in interesting]), re.IGNORECASE
self.proxy = kwargs['socks5'] )
self.torControl = kwargs['TorController']
self.headers ={ self.proxy = kwargs["socks5"]
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:72.0) Gecko/20100101 Firefox/72.0', self.torControl = kwargs["TorController"]
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', self.headers = {
'Accept-Language':'en-US,en;q=0.5', "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:72.0) Gecko/20100101 Firefox/72.0",
'DNT': '1', 'Connection': "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
'keep-alive', "Accept-Language": "en-US,en;q=0.5",
'Upgrade-Insecure-Requests': '1'} "DNT": "1",
"Connection": "keep-alive",
"Upgrade-Insecure-Requests": "1",
}
def get_tor_session(self): def get_tor_session(self):
try: try:
@ -54,69 +57,84 @@ class Plugin(Operator):
return s return s
def renew_connection(self): def renew_connection(self):
with Controller.from_port(port = self.torControl['port']) as controller: with Controller.from_port(port=self.torControl["port"]) as controller:
# Now we switch TOR identities to make sure we have a good connection # Now we switch TOR identities to make sure we have a good connection
self.logger.info('Getting new Tor IP') self.logger.info("Getting new Tor IP")
# authenticate to our local TOR controller # authenticate to our local TOR controller
controller.authenticate(self.torControl['password']) controller.authenticate(self.torControl["password"])
# send the signal for a new identity # send the signal for a new identity
controller.signal(Signal.NEWNYM) controller.signal(Signal.NEWNYM)
# wait for the new identity to be initialized # wait for the new identity to be initialized
time.sleep(controller.get_newnym_wait()) time.sleep(controller.get_newnym_wait())
session = self.get_tor_session() session = self.get_tor_session()
self.logger.info(f"IP is {session.get('http://httpbin.org/ip').json()['origin']}") self.logger.info(
f"IP is {session.get('http://httpbin.org/ip').json()['origin']}"
)
def run_sessions(self, onion): def run_sessions(self, onion):
retry = 0 retry = 0
result = None result = None
while True: while True:
try: try:
url = 'http://'+onion url = "http://" + onion
self.logger.info(url) self.logger.info(url)
content = self.get_tor_session().get(url) content = self.get_tor_session().get(url)
if content.status_code == 200: if content.status_code == 200:
result = content.text result = content.text
if result: if result:
html = BeautifulSoup(result,features="lxml") html = BeautifulSoup(result, features="lxml")
# testing hardcorded filepath # testing hardcorded filepath
with open("/home/tony/Projects/OnionScraper_v2/onion_master_list.txt", "w") as fp: with open(
for onion in re.findall('([a-z2-7]{16,56}\.onion)',result): "/home/tony/Projects/OnionScraper_v2/onion_master_list.txt",
fp.write("%s\n" % onion) "w",
if html: ) as fp:
index = { for onion in re.findall("([a-z2-7]{16,56}\.onion)", result):
'HTML':result, fp.write("%s\n" % onion)
'title':html.title.text, if html:
'language':detect(html.text), index = {
'date-crawled':dt.utcnow().strftime('%Y-%m-%dT%H:%M:%S.%f')+ 'Z', "HTML": result,
'status':'success', "title": html.title.text,
'interestingKeywords':list(set(self.interesting.findall(result))) "language": detect(html.text),
} "date-crawled": dt.utcnow().strftime(
else: "%Y-%m-%dT%H:%M:%S.%f"
index = { )
'HTML':result, + "Z",
'title': None, "status": "success",
'language': None, "interestingKeywords": list(
'date-crawled':dt.utcnow().strftime('%Y-%m-%dT%H:%M:%S.%f')+ 'Z', set(self.interesting.findall(result))
'status':'success', ),
'interestingKeywords':list(set(self.interesting.findall(result))) }
} else:
return self.response(index, onion, self.plugin_name) index = {
except requests.exceptions.ConnectionError as connection_error: "HTML": result,
self.logger.error(f'Failed connecting to http://{url}') "title": None,
self.logger.debug(connection_error) "language": None,
except Exception as e: "date-crawled": dt.utcnow().strftime(
self.logger.error(e) "%Y-%m-%dT%H:%M:%S.%f"
self.logger.debug(traceback.print_exc()) )
+ "Z",
self.logger.info('[x] No results found retrying ...') "status": "success",
retry += 1 "interestingKeywords": list(
self.renew_connection() set(self.interesting.findall(result))
if retry > self.retries: ),
self.logger.error('[x] Max retries exceeded') }
return self.response({'status':"failure"}, onion, self.plugin_name) return self.response(index, onion, self.plugin_name)
except requests.exceptions.ConnectionError as connection_error:
self.logger.error(f"Failed connecting to http://{url}")
self.logger.debug(connection_error)
except Exception as e:
self.logger.error(e)
self.logger.debug(traceback.print_exc())
self.logger.info("[x] No results found retrying ...")
retry += 1
self.renew_connection()
if retry > self.retries:
self.logger.error("[x] Max retries exceeded")
return self.response({"status": "failure"}, onion, self.plugin_name)
def handle_onion(self, db, onion): def handle_onion(self, db, onion):
content = self.run_sessions(onion) content = self.run_sessions(onion)
if content[self.plugin_name]['status'] == 'success': if content[self.plugin_name]["status"] == "success":
if self._onion_is_allowed(content, db, 'HTML'): if self._onion_is_allowed(content, db, "HTML"):
self.es.update(db['_id'], content) self.es.update(db["_id"], content)

@ -24,6 +24,7 @@ from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
from onionscraper.operators import Operator from onionscraper.operators import Operator
class Plugin(Operator): class Plugin(Operator):
"""OnionScraper main work logic. """OnionScraper main work logic.
@ -49,8 +50,8 @@ class Plugin(Operator):
'Accept-Language':'en-US,en;q=0.5', 'Accept-Language':'en-US,en;q=0.5',
'DNT': '1', 'Connection': 'DNT': '1', 'Connection':
'keep-alive', 'keep-alive',
'Upgrade-Insecure-Requests': '1'} 'Upgrade-Insecure-Requests': '1'
}
blacklist = kwargs['blacklist'].split(',') blacklist = kwargs['blacklist'].split(',')
self.blacklist = re.compile('|'.join([re.escape(word) for word in blacklist]), re.IGNORECASE) self.blacklist = re.compile('|'.join([re.escape(word) for word in blacklist]), re.IGNORECASE)
@ -110,9 +111,10 @@ class Plugin(Operator):
options = Options() options = Options()
options.headless = True options.headless = True
driver = webdriver.Firefox( driver = webdriver.Firefox(
executable_path='/home/tony/Projects/OnionScraper/geckodriver', executable_path='/home/tony/Projects/OnionScraper/geckodriver',
options=options, options=options,
firefox_profile=fp) firefox_profile=fp
)
url = 'http://' + onion url = 'http://' + onion
driver.get(url) driver.get(url)
uid = str(uuid4()).split('-')[0] uid = str(uuid4()).split('-')[0]
@ -131,8 +133,6 @@ class Plugin(Operator):
self.logger.error('[x] Unable to take screenshot') self.logger.error('[x] Unable to take screenshot')
return self.response("failure",None,onion) return self.response("failure",None,onion)
def get_tor_session(self): def get_tor_session(self):
try: try:
s = requests.session() s = requests.session()
@ -172,31 +172,31 @@ class Plugin(Operator):
return return
def run_sessions(self, onion): def run_sessions(self, onion):
retry = 0 retry = 0
result = None result = None
while True: while True:
try: try:
url = 'http://'+onion url = 'http://'+onion
self.logger.info(url) self.logger.info(url)
content = self.session.get(url) content = self.session.get(url)
if content.status_code == 200: if content.status_code == 200:
result = content.json() result = content.json()
except JSONDecodeError as e: except JSONDecodeError as e:
self.logger.debug(f'JSONDecodeError {e}') self.logger.debug(f'JSONDecodeError {e}')
result = content.text result = content.text
except Exception as e: except Exception as e:
self.logger.error(e) self.logger.error(e)
self.logger.debug(traceback.print_exc()) self.logger.debug(traceback.print_exc())
finally: finally:
if result: if result:
return self.response("success",result,onion) return self.response("success",result,onion)
else: else:
self.logger.info('[x] No results found retrying ...') self.logger.info('[x] No results found retrying ...')
retry += 1 retry += 1
self.renew_connection() self.renew_connection()
if retry > self.retries: if retry > self.retries:
self.logger.error('[x] Max retries exceeded') self.logger.error('[x] Max retries exceeded')
return self.response("failure",None, onion) return self.response("failure",None, onion)
def run_onionscan(self, onion): def run_onionscan(self, onion):
self.logger.info("[*] Running onionscan on %s", onion) self.logger.info("[*] Running onionscan on %s", onion)
@ -259,6 +259,3 @@ class Plugin(Operator):
finally: finally:
pass pass
#sys.exit(0) #sys.exit(0)

@ -1,6 +1,7 @@
from onionscraper.operators import Operator from onionscraper.operators import Operator
class Plugin(Operator): class Plugin(Operator):
"""Operator for output to flat CSV file.""" """Operator for output to flat CSV file."""
def __init__(self, filename, base_score): def __init__(self, filename, base_score):

Loading…
Cancel
Save