formatting

pull/1/head
2O4 4 years ago
parent 4779d6952a
commit 7f026c8a4c

4
.gitignore vendored

@ -3,10 +3,12 @@ webui
templates
OnionIngestor.egg-info
screenshots
dump.rdb
dump.rdb
onionscandb
config.yml
*.log
*.pyc
__pycache__
venv
.vscode
build

@ -6,42 +6,41 @@ import yaml
from pathlib import Path
SOURCE = 'onioningestor.sources'
OPERATOR = 'onioningestor.operators'
SOURCE = "onioningestor.sources"
OPERATOR = "onioningestor.operators"
INTERNAL_OPTIONS = [
'saved_state',
'module',
'credentials',
"saved_state",
"module",
"credentials",
]
ARTIFACT_TYPES = 'artifact_types'
FILTER_STRING = 'filter'
ALLOWED_SOURCES = 'allowed_sources'
NAME = 'name'
ARTIFACT_TYPES = "artifact_types"
FILTER_STRING = "filter"
ALLOWED_SOURCES = "allowed_sources"
NAME = "name"
class Config:
"""Config read/write operations, and convenience methods."""
def __init__(self, filename, logger):
"""Read a config file."""
self.logger = logger
self.filename = filename
with io.open(self.filename, 'r') as f:
with io.open(self.filename, "r") as f:
try:
self.logger.info("Loading config file")
self.config = yaml.safe_load(f.read())
except yaml.error.YAMLError:
self.logger.error("YAML error in config")
@staticmethod
def _load_plugin(plugin_type, plugin):
"""Returns plugin class or raises an exception.
:raises: threatingestor.exceptions.PluginError
"""
try:
module = importlib.import_module('.'.join([plugin_type, plugin]))
module = importlib.import_module(".".join([plugin_type, plugin]))
return module.Plugin
except Exception as e:
print(e)
@ -49,89 +48,86 @@ class Config:
def daemon(self):
"""Returns boolean, are we daemonizing?"""
return self.config['general']['daemon']
return self.config["general"]["daemon"]
def elasticsearch(self):
"""Returns elasticsearch config"""
return self.config['general']['elasticsearch']
return self.config["general"]["elasticsearch"]
def sleep(self):
"""Returns number of seconds to sleep between iterations, if daemonizing."""
return self.config['general']['sleep']
return self.config["general"]["sleep"]
def blacklist(self):
return self.config['general']['blacklist'].split(',')
# def onionscanner(self):
# """Returns onionscanner config dict"""
# screenshots = self.config['onionscanner'].pop('screenshots_path', None)
# if screenshots:
# self.config['onionscanner']['screenshots_path'] = Path(screenshots)
# else:
# self.config['onionscanner']['screenshots_path'] = Path(__file__).parents[1]/'screenshots'
# blacklist = self.config['onionscanner'].pop('blacklist', None)
# if blacklist:
# self.config['onionscanner']['blacklist'] = blacklist.split(',')
# interestingKeywords = self.config['onionscanner'].pop('interestingKeywords', None)
# if interestingKeywords:
# self.config['onionscanner']['interestingKeywords'] = blacklist.split(',')
# return self.config['onionscanner']
return self.config["general"]["blacklist"].split(",")
# def onionscanner(self):
# """Returns onionscanner config dict"""
# screenshots = self.config['onionscanner'].pop('screenshots_path', None)
# if screenshots:
# self.config['onionscanner']['screenshots_path'] = Path(screenshots)
# else:
# self.config['onionscanner']['screenshots_path'] = Path(__file__).parents[1]/'screenshots'
# blacklist = self.config['onionscanner'].pop('blacklist', None)
# if blacklist:
# self.config['onionscanner']['blacklist'] = blacklist.split(',')
# interestingKeywords = self.config['onionscanner'].pop('interestingKeywords', None)
# if interestingKeywords:
# self.config['onionscanner']['interestingKeywords'] = blacklist.split(',')
# return self.config['onionscanner']
def notifiers(self):
"""Returns notifiers config dictionary."""
return self.config.get('notifiers', {})
return self.config.get("notifiers", {})
def logging(self):
"""Returns logging config dictionary."""
return self.config.get('logging', {})
return self.config.get("logging", {})
def credentials(self, credential_name):
"""Return a dictionary with the specified credentials."""
for credential in self.config['credentials']:
for credential in self.config["credentials"]:
for key, value in credential.items():
if key == NAME and value == credential_name:
return credential
return {}
def sources(self):
"""Return a list of (name, Source class, {kwargs}) tuples.
:raises: threatingestor.exceptions.PluginError
"""
sources = []
for source in self.config['sources']:
for source in self.config["sources"]:
kwargs = {}
for key, value in source.items():
if key not in INTERNAL_OPTIONS:
kwargs[key] = value
elif key == 'credentials':
elif key == "credentials":
# Grab these named credentials
credential_name = value
for credential_key, credential_value in self.credentials(credential_name).items():
for credential_key, credential_value in self.credentials(
credential_name
).items():
if credential_key != NAME:
kwargs[credential_key] = credential_value
# load and initialize the plugin
self.logger.info(f"Found source '{source[NAME]}'")
sources.append((source[NAME], self._load_plugin(SOURCE, source['module']), kwargs))
sources.append(
(source[NAME], self._load_plugin(SOURCE, source["module"]), kwargs)
)
self.logger.info(f"Found {len(sources)} total sources")
return sources
def operators(self):
"""Return a list of (name, Operator class, {kwargs}) tuples.
:raises: threatingestor.exceptions.PluginError
"""
operators = []
for operator in self.config['operators']:
for operator in self.config["operators"]:
kwargs = {}
for key, value in operator.items():
if key not in INTERNAL_OPTIONS:
@ -140,7 +136,11 @@ class Config:
artifact_types = []
for artifact in value:
try:
artifact_types.append(threatingestor.artifacts.STRING_MAP[artifact.lower().strip()])
artifact_types.append(
threatingestor.artifacts.STRING_MAP[
artifact.lower().strip()
]
)
except KeyError:
# ignore invalid artifact types
pass
@ -148,7 +148,7 @@ class Config:
elif key == FILTER_STRING:
# pass in special filter_string option
kwargs['filter_string'] = value
kwargs["filter_string"] = value
elif key == NAME:
# exclude name key from operator kwargs, since it's not used
@ -157,16 +157,24 @@ class Config:
else:
kwargs[key] = value
elif key == 'credentials':
elif key == "credentials":
# Grab these named credentials
credential_name = value
for credential_key, credential_value in self.credentials(credential_name).items():
for credential_key, credential_value in self.credentials(
credential_name
).items():
if credential_key != NAME:
kwargs[credential_key] = credential_value
# load and initialize the plugin
self.logger.info(f"Found operator '{operator[NAME]}'")
operators.append((operator[NAME], self._load_plugin(OPERATOR, operator['module']), kwargs))
operators.append(
(
operator[NAME],
self._load_plugin(OPERATOR, operator["module"]),
kwargs,
)
)
self.logger.info(f"Found {len(operators)} total operators")
return operators

@ -8,7 +8,7 @@ class DbHandlerElasticSearch:
self.logger = logger
self.logger.info('Creating Elasticsearch mapping')
self.config = config
self.mapping = '''
self.mapping = """
{
"mappings": {
"_doc": {
@ -18,7 +18,7 @@ class DbHandlerElasticSearch:
},
"blacklist": {
"type": "keyword"
},
},
"monitor": {
"type": "boolean",
"null_value": "false"
@ -50,7 +50,7 @@ class DbHandlerElasticSearch:
}
}
}
'''
"""
self.index = self.config['index']
try:
self.es = Elasticsearch([{
@ -87,4 +87,3 @@ class DbHandlerElasticSearch:
status = self.es.index(index=self.index,body=data)
self.count()
return status

@ -2,6 +2,7 @@ import os
import logging
from pathlib import Path
class LoggerHandler():
def __init__(self, level):
self.level = getattr(logging, level)
@ -30,4 +31,3 @@ class LoggerHandler():
def start_logging(self):
self.logger.info('Starting OnionScraper')
return self.logger

@ -19,6 +19,7 @@ class Operator:
method name with an underscore to denote a ``_private_method``. Do not
override other existing methods from this class.
"""
def __init__(self, logger, elasticsearch, allowed_sources=None):
"""Override this constructor in child classes.
@ -87,7 +88,7 @@ class Operator:
def collect(self, onions):
for onion in onions:
self.logger.info(f'thread function processing {onion}')
# Add link to database
# Add link to database
db = self.es.save({
'hiddenService':onion.url,
'monitor':'false',

@ -24,24 +24,27 @@ class Plugin(Operator):
def __init__(self, logger, elasticsearch, allowed_sources, **kwargs):
super(Plugin, self).__init__(logger, elasticsearch, allowed_sources)
self.plugin_name = 'simple-html'
self.plugin_name = "simple-html"
self.logger.info(f"Initializing {self.plugin_name}")
self.timeout = int(kwargs['timeout'])
self.retries = int(kwargs['retries'])
interesting = kwargs['interestingKeywords'].split(',')
self.interesting = re.compile('|'.join([re.escape(word) for word in interesting]), re.IGNORECASE)
self.proxy = kwargs['socks5']
self.torControl = kwargs['TorController']
self.headers ={
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:72.0) Gecko/20100101 Firefox/72.0',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
'Accept-Language':'en-US,en;q=0.5',
'DNT': '1', 'Connection':
'keep-alive',
'Upgrade-Insecure-Requests': '1'}
self.timeout = int(kwargs["timeout"])
self.retries = int(kwargs["retries"])
interesting = kwargs["interestingKeywords"].split(",")
self.interesting = re.compile(
"|".join([re.escape(word) for word in interesting]), re.IGNORECASE
)
self.proxy = kwargs["socks5"]
self.torControl = kwargs["TorController"]
self.headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:72.0) Gecko/20100101 Firefox/72.0",
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
"Accept-Language": "en-US,en;q=0.5",
"DNT": "1",
"Connection": "keep-alive",
"Upgrade-Insecure-Requests": "1",
}
def get_tor_session(self):
try:
@ -54,69 +57,84 @@ class Plugin(Operator):
return s
def renew_connection(self):
with Controller.from_port(port = self.torControl['port']) as controller:
with Controller.from_port(port=self.torControl["port"]) as controller:
# Now we switch TOR identities to make sure we have a good connection
self.logger.info('Getting new Tor IP')
self.logger.info("Getting new Tor IP")
# authenticate to our local TOR controller
controller.authenticate(self.torControl['password'])
controller.authenticate(self.torControl["password"])
# send the signal for a new identity
controller.signal(Signal.NEWNYM)
# wait for the new identity to be initialized
time.sleep(controller.get_newnym_wait())
session = self.get_tor_session()
self.logger.info(f"IP is {session.get('http://httpbin.org/ip').json()['origin']}")
self.logger.info(
f"IP is {session.get('http://httpbin.org/ip').json()['origin']}"
)
def run_sessions(self, onion):
retry = 0
result = None
while True:
try:
url = 'http://'+onion
self.logger.info(url)
content = self.get_tor_session().get(url)
if content.status_code == 200:
result = content.text
if result:
html = BeautifulSoup(result,features="lxml")
# testing hardcoded filepath
with open("/home/tony/Projects/OnionScraper_v2/onion_master_list.txt", "w") as fp:
for onion in re.findall('([a-z2-7]{16,56}\.onion)',result):
fp.write("%s\n" % onion)
if html:
index = {
'HTML':result,
'title':html.title.text,
'language':detect(html.text),
'date-crawled':dt.utcnow().strftime('%Y-%m-%dT%H:%M:%S.%f')+ 'Z',
'status':'success',
'interestingKeywords':list(set(self.interesting.findall(result)))
}
else:
index = {
'HTML':result,
'title': None,
'language': None,
'date-crawled':dt.utcnow().strftime('%Y-%m-%dT%H:%M:%S.%f')+ 'Z',
'status':'success',
'interestingKeywords':list(set(self.interesting.findall(result)))
}
return self.response(index, onion, self.plugin_name)
except requests.exceptions.ConnectionError as connection_error:
self.logger.error(f'Failed connecting to http://{url}')
self.logger.debug(connection_error)
except Exception as e:
self.logger.error(e)
self.logger.debug(traceback.print_exc())
self.logger.info('[x] No results found retrying ...')
retry += 1
self.renew_connection()
if retry > self.retries:
self.logger.error('[x] Max retries exceeded')
return self.response({'status':"failure"}, onion, self.plugin_name)
retry = 0
result = None
while True:
try:
url = "http://" + onion
self.logger.info(url)
content = self.get_tor_session().get(url)
if content.status_code == 200:
result = content.text
if result:
html = BeautifulSoup(result, features="lxml")
# testing hardcoded filepath
with open(
"/home/tony/Projects/OnionScraper_v2/onion_master_list.txt",
"w",
) as fp:
for onion in re.findall("([a-z2-7]{16,56}\.onion)", result):
fp.write("%s\n" % onion)
if html:
index = {
"HTML": result,
"title": html.title.text,
"language": detect(html.text),
"date-crawled": dt.utcnow().strftime(
"%Y-%m-%dT%H:%M:%S.%f"
)
+ "Z",
"status": "success",
"interestingKeywords": list(
set(self.interesting.findall(result))
),
}
else:
index = {
"HTML": result,
"title": None,
"language": None,
"date-crawled": dt.utcnow().strftime(
"%Y-%m-%dT%H:%M:%S.%f"
)
+ "Z",
"status": "success",
"interestingKeywords": list(
set(self.interesting.findall(result))
),
}
return self.response(index, onion, self.plugin_name)
except requests.exceptions.ConnectionError as connection_error:
self.logger.error(f"Failed connecting to http://{url}")
self.logger.debug(connection_error)
except Exception as e:
self.logger.error(e)
self.logger.debug(traceback.print_exc())
self.logger.info("[x] No results found retrying ...")
retry += 1
self.renew_connection()
if retry > self.retries:
self.logger.error("[x] Max retries exceeded")
return self.response({"status": "failure"}, onion, self.plugin_name)
def handle_onion(self, db, onion):
content = self.run_sessions(onion)
if content[self.plugin_name]['status'] == 'success':
if self._onion_is_allowed(content, db, 'HTML'):
self.es.update(db['_id'], content)
if content[self.plugin_name]["status"] == "success":
if self._onion_is_allowed(content, db, "HTML"):
self.es.update(db["_id"], content)

@ -24,6 +24,7 @@ from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
from onionscraper.operators import Operator
class Plugin(Operator):
"""OnionScraper main work logic.
@ -49,8 +50,8 @@ class Plugin(Operator):
'Accept-Language':'en-US,en;q=0.5',
'DNT': '1', 'Connection':
'keep-alive',
'Upgrade-Insecure-Requests': '1'}
'Upgrade-Insecure-Requests': '1'
}
blacklist = kwargs['blacklist'].split(',')
self.blacklist = re.compile('|'.join([re.escape(word) for word in blacklist]), re.IGNORECASE)
@ -110,9 +111,10 @@ class Plugin(Operator):
options = Options()
options.headless = True
driver = webdriver.Firefox(
executable_path='/home/tony/Projects/OnionScraper/geckodriver',
options=options,
firefox_profile=fp)
executable_path='/home/tony/Projects/OnionScraper/geckodriver',
options=options,
firefox_profile=fp
)
url = 'http://' + onion
driver.get(url)
uid = str(uuid4()).split('-')[0]
@ -131,8 +133,6 @@ class Plugin(Operator):
self.logger.error('[x] Unable to take screenshot')
return self.response("failure",None,onion)
def get_tor_session(self):
try:
s = requests.session()
@ -172,31 +172,31 @@ class Plugin(Operator):
return
def run_sessions(self, onion):
retry = 0
result = None
while True:
try:
url = 'http://'+onion
self.logger.info(url)
content = self.session.get(url)
if content.status_code == 200:
result = content.json()
except JSONDecodeError as e:
self.logger.debug(f'JSONDecodeError {e}')
result = content.text
except Exception as e:
self.logger.error(e)
self.logger.debug(traceback.print_exc())
finally:
if result:
return self.response("success",result,onion)
else:
self.logger.info('[x] No results found retrying ...')
retry += 1
self.renew_connection()
if retry > self.retries:
self.logger.error('[x] Max retries exceeded')
return self.response("failure",None, onion)
retry = 0
result = None
while True:
try:
url = 'http://'+onion
self.logger.info(url)
content = self.session.get(url)
if content.status_code == 200:
result = content.json()
except JSONDecodeError as e:
self.logger.debug(f'JSONDecodeError {e}')
result = content.text
except Exception as e:
self.logger.error(e)
self.logger.debug(traceback.print_exc())
finally:
if result:
return self.response("success",result,onion)
else:
self.logger.info('[x] No results found retrying ...')
retry += 1
self.renew_connection()
if retry > self.retries:
self.logger.error('[x] Max retries exceeded')
return self.response("failure",None, onion)
def run_onionscan(self, onion):
self.logger.info("[*] Running onionscan on %s", onion)
@ -259,6 +259,3 @@ class Plugin(Operator):
finally:
pass
#sys.exit(0)

@ -1,6 +1,7 @@
from onionscraper.operators import Operator
class Plugin(Operator):
"""Operator for output to flat CSV file."""
def __init__(self, filename, base_score):

Loading…
Cancel
Save