renamed the package and fixed some issues
parent
23dd01a983
commit
8258056bef
@ -0,0 +1,131 @@
|
|||||||
|
import sys
|
||||||
|
import time
|
||||||
|
import traceback
|
||||||
|
import collections
|
||||||
|
|
||||||
|
from . import config
|
||||||
|
from . import dbhandler
|
||||||
|
from . import loghandler
|
||||||
|
|
||||||
|
|
||||||
|
class Ingestor:
    """Main work loop: load the config, run sources, feed operators.

    The constructor wires together the logger, the ``Config`` object, the
    Elasticsearch handler and one instance of every configured source and
    operator plugin. ``run`` then executes them once or in a loop.
    """

    def __init__(self, args):
        """Build the ingestor from parsed command-line arguments.

        :param args: object exposing ``logLevel`` and ``configFile``.

        Exits the process with status 1 when the Elasticsearch handler or
        any plugin fails to initialise.
        """
        # Load logger
        log = loghandler.LoggerHandler(args.logLevel)
        self.logger = log.start_logging()

        # Load config
        self.config = config.Config(args.configFile, self.logger)
        self.blacklist = self.config.blacklist()

        # Load Elasticsearch.
        try:
            self.es = dbhandler.DbHandlerElasticSearch(
                self.config.elasticsearch(),
                self.logger)
        except Exception as e:
            # Error loading elasticsearch.
            self._abort(e)

        # Instantiate plugins.
        try:
            self.logger.info("Initializing sources")
            self.sources = {name: source(self.logger, **kwargs)
                            for name, source, kwargs in self.config.sources()}

            self.logger.info("initializing operators")
            self.operators = {
                name: operator(self.logger, self.es, self.blacklist, **kwargs)
                for name, operator, kwargs in self.config.operators()}

            self.logger.info("initializing notifiers")
            # TODO: notifiers are announced but not instantiated yet.
        except Exception as e:
            # Error loading plugins.
            self._abort(e)

    def _abort(self, error):
        """Log ``error`` with its traceback and exit with status 1.

        Must be called from inside an ``except`` block so that
        ``traceback.format_exc()`` can see the active exception.
        """
        self.logger.error(error)
        # format_exc() returns the traceback text; print_exc() returns None.
        self.logger.debug(traceback.format_exc())
        sys.exit(1)

    def run(self):
        """Run once, or forever, depending on config."""
        if self.config.daemon():
            self.logger.info("Running forever, in a loop")
            self.run_forever()
        else:
            self.logger.info("Running once, to completion")
            self.run_once()

    def run_once(self):
        """Run each source once, passing what it found to every operator.

        A failing source or operator is logged and skipped; it never stops
        the remaining sources/operators from running.
        """
        # Statistics placeholder: nothing updates it at present, so the
        # summary line below always reports an empty dict.
        summary = collections.Counter()

        for source_name, source in self.sources.items():
            # Run the source to collect onions.
            self.logger.info(f"Running source '{source_name}'")
            try:
                onions = source.run()
            except Exception as e:
                self.logger.error(e)
                self.logger.error(traceback.format_exc())
                continue

            if not onions:
                # Nothing to hand to the operators for this source.
                self.logger.info('No links found')
                continue
            self.logger.info('Found hidden links')

            # Process the onions with each operator.
            for operator_name, operator in self.operators.items():
                self.logger.info(
                    f"Processing found onions with operator '{operator_name}'")
                try:
                    doc = operator.process(onions)
                    # The base Operator.process returns None (operators
                    # save their own documents); only persist a real result.
                    if doc is not None:
                        self.es.save(doc)
                except Exception as e:
                    self.logger.error(e)
                    self.logger.error(traceback.format_exc())
                    continue

        # Log the summary.
        self.logger.info(f"New artifacts: {dict(summary)}")

    def run_forever(self):
        """Run forever, sleeping for the configured interval between runs."""
        while True:
            self.run_once()

            interval = self.config.sleep()
            self.logger.info(f"Sleeping for {interval} seconds")
            time.sleep(interval)
|
||||||
|
|
||||||
|
|
||||||
|
def artifact_types(artifact_list):
    """Count the artifacts in ``artifact_list`` by lower-cased class name.

    :param artifact_list: iterable of arbitrary objects.
    :returns: plain ``dict`` mapping each class name to its occurrence
        count, keyed in order of first appearance.
    """
    tally = collections.Counter(
        item.__class__.__name__.lower() for item in artifact_list)
    return dict(tally)
|
||||||
|
|
||||||
|
|
@ -0,0 +1,49 @@
|
|||||||
|
"""OnionScraper
|
||||||
|
|
||||||
|
A Python3 application for scraping hidden services and indexing them into Elasticsearch
|
||||||
|
|
||||||
|
Installation:
|
||||||
|
This application assumes you have python3 and pip3 installed.
|
||||||
|
|
||||||
|
pip3 install -r requirements.txt
|
||||||
|
|
||||||
|
|
||||||
|
This software is provided subject to the MIT license stated below.
|
||||||
|
--------------------------------------------------
|
||||||
|
MIT License
|
||||||
|
|
||||||
|
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||||
|
of this software and associated documentation files (the "Software"), to deal
|
||||||
|
in the Software without restriction, including without limitation the rights
|
||||||
|
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
copies of the Software, and to permit persons to whom the Software is
|
||||||
|
furnished to do so, subject to the following conditions:
|
||||||
|
|
||||||
|
The above copyright notice and this permission notice shall be included in all
|
||||||
|
copies or substantial portions of the Software.
|
||||||
|
|
||||||
|
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||||
|
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||||
|
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||||
|
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||||
|
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||||
|
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||||
|
SOFTWARE.
|
||||||
|
--------------------------------------------------
|
||||||
|
"""
|
||||||
|
import argparse
|
||||||
|
|
||||||
|
from onioningestor import Ingestor
|
||||||
|
|
||||||
|
|
||||||
|
# Load arguments from user
|
||||||
|
parser = argparse.ArgumentParser(
    prog='onionscraper',
    description=__doc__,
    formatter_class=argparse.RawDescriptionHelpFormatter,
)
# Mandatory path of the configuration file.
parser.add_argument(
    '-c', '--config',
    dest="configFile",
    required=True,
    help='Path to config file',
)
# Optional verbosity, restricted to the standard logging level names.
parser.add_argument(
    "--log",
    dest="logLevel",
    default='INFO',
    choices=['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'],
    help="Set the logging level, default is INFO",
)

# Parse the command line, build the ingestor and hand control to it.
args = parser.parse_args()
app = Ingestor(args)

app.run()
|
@ -0,0 +1,172 @@
|
|||||||
|
import io
|
||||||
|
import importlib
|
||||||
|
import traceback
|
||||||
|
|
||||||
|
import yaml
|
||||||
|
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
# Dotted package prefixes that a plugin's ``module`` name is appended to
# when the plugin is imported.
SOURCE = 'onioningestor.sources'
OPERATOR = 'onioningestor.operators'

# Plugin config keys handled by the config loader itself; they are never
# forwarded verbatim to a plugin constructor.
INTERNAL_OPTIONS = ['saved_state', 'module', 'credentials']

# Names of config keys that get special treatment.
ARTIFACT_TYPES = 'artifact_types'
FILTER_STRING = 'filter'
ALLOWED_SOURCES = 'allowed_sources'
NAME = 'name'
|
||||||
|
|
||||||
|
|
||||||
|
class Config:
    """Config read operations and convenience accessors.

    Wraps the dictionary parsed from the YAML config file and exposes the
    sections the rest of the package reads (``general``, ``sources``,
    ``operators``, ``credentials``, ``notifiers``, ``logging``).
    """

    def __init__(self, filename, logger):
        """Read and parse the YAML config file.

        :param filename: path of the YAML file to load.
        :param logger: logger used for progress and error messages.
        :raises yaml.error.YAMLError: if the file is not valid YAML. The
            error is logged and re-raised, because every accessor below
            needs ``self.config`` to exist.
        """
        self.logger = logger
        self.filename = filename
        with io.open(self.filename, 'r') as f:
            try:
                self.logger.info("Loading config file")
                self.config = yaml.safe_load(f.read())
            except yaml.error.YAMLError:
                self.logger.error("YAML error in config")
                raise

    @staticmethod
    def _load_plugin(plugin_type, plugin):
        """Import ``<plugin_type>.<plugin>`` and return its ``Plugin`` class.

        :param plugin_type: dotted package prefix (``SOURCE``/``OPERATOR``).
        :param plugin: module name inside that package.
        :raises ImportError: if the module cannot be imported or does not
            define ``Plugin``. The original error is chained as the cause.
        """
        module_name = '.'.join([plugin_type, plugin])
        try:
            module = importlib.import_module(module_name)
            return module.Plugin
        except Exception as e:
            # Returning None here would only surface later as an opaque
            # "'NoneType' object is not callable" in the caller.
            raise ImportError(
                f"Unable to load plugin '{module_name}': {e}") from e

    def daemon(self):
        """Returns boolean, are we daemonizing?"""
        return self.config['general']['daemon']

    def elasticsearch(self):
        """Returns the ``general.elasticsearch`` config section."""
        return self.config['general']['elasticsearch']

    def sleep(self):
        """Returns number of seconds to sleep between iterations, if daemonizing."""
        return self.config['general']['sleep']

    def blacklist(self):
        """Return the comma-separated ``general.blacklist`` as a list.

        Surrounding whitespace is stripped and empty entries are dropped,
        so ``"foo, bar,"`` yields ``['foo', 'bar']``.
        """
        raw = self.config['general']['blacklist']
        return [word.strip() for word in raw.split(',') if word.strip()]

    def notifiers(self):
        """Returns notifiers config dictionary."""
        return self.config.get('notifiers', {})

    def logging(self):
        """Returns logging config dictionary."""
        return self.config.get('logging', {})

    def credentials(self, credential_name):
        """Return the credentials entry whose ``name`` is ``credential_name``.

        :returns: the matching dictionary, or ``{}`` when none matches.
        """
        for credential in self.config['credentials']:
            if NAME in credential and credential[NAME] == credential_name:
                return credential
        return {}

    def _credential_kwargs(self, credential_name):
        """Return the named credentials without their ``name`` key."""
        return {key: value
                for key, value in self.credentials(credential_name).items()
                if key != NAME}

    def sources(self):
        """Return a list of (name, Source class, {kwargs}) tuples.

        :raises ImportError: if a source plugin cannot be loaded.
        """
        sources = []

        for source in self.config['sources']:
            kwargs = {}
            for key, value in source.items():
                if key not in INTERNAL_OPTIONS:
                    kwargs[key] = value
                elif key == 'credentials':
                    # Merge the named credentials into the plugin kwargs.
                    kwargs.update(self._credential_kwargs(value))

            # load the plugin class
            self.logger.info(f"Found source '{source[NAME]}'")
            sources.append(
                (source[NAME], self._load_plugin(SOURCE, source['module']), kwargs))

        self.logger.info(f"Found {len(sources)} total sources")
        return sources

    def operators(self):
        """Return a list of (name, Operator class, {kwargs}) tuples.

        :raises ImportError: if an operator plugin cannot be loaded.
        """
        operators = []
        for operator in self.config['operators']:
            kwargs = {}
            for key, value in operator.items():
                if key not in INTERNAL_OPTIONS:
                    if key == ARTIFACT_TYPES:
                        # parse out special artifact_types option
                        # NOTE(review): ``threatingestor`` is not imported
                        # anywhere visible in this module, so this branch
                        # presumably raises NameError when an operator
                        # config contains ``artifact_types`` - confirm and
                        # either import the map or drop the option.
                        artifact_types = []
                        for artifact in value:
                            try:
                                artifact_types.append(
                                    threatingestor.artifacts.STRING_MAP[artifact.lower().strip()])
                            except KeyError:
                                # ignore invalid artifact types
                                pass
                        kwargs[key] = artifact_types

                    elif key == FILTER_STRING:
                        # pass in special filter_string option
                        kwargs['filter_string'] = value

                    elif key == NAME:
                        # exclude name key from operator kwargs, since it's not used
                        pass

                    else:
                        kwargs[key] = value

                elif key == 'credentials':
                    # Merge the named credentials into the plugin kwargs.
                    kwargs.update(self._credential_kwargs(value))

            # load the plugin class
            self.logger.info(f"Found operator '{operator[NAME]}'")
            operators.append(
                (operator[NAME], self._load_plugin(OPERATOR, operator['module']), kwargs))

        self.logger.info(f"Found {len(operators)} total operators")
        return operators
|
@ -0,0 +1,75 @@
|
|||||||
|
import sys
|
||||||
|
import traceback
|
||||||
|
|
||||||
|
from elasticsearch import Elasticsearch, helpers
|
||||||
|
|
||||||
|
class DbHandlerElasticSearch:
    """Thin wrapper around an Elasticsearch client bound to one index."""

    def __init__(self, config, logger):
        """Connect to Elasticsearch and create the index if necessary.

        :param config: mapping providing ``index``, ``host`` and ``port``.
        :param logger: logger used for progress and error messages.

        Exits the process with status 1 if the client cannot be created
        or the index-creation request raises.
        """
        self.logger = logger
        self.logger.info('Creating Elasticsearch mapping')
        self.config = config
        self.mapping = '''
        {
          "mappings": {
            "_doc": {
              "properties": {
                "hiddenService": {
                  "type": "text"
                },
                "blacklist": {
                  "type": "keyword"
                },
                "monitor": {
                  "type": "boolean"
                },
                "simple-html": {
                  "type": "nested",
                  "properties": {
                    "HTML": {
                      "type": "long"
                    },
                    "title": {
                      "type": "text"
                    },
                    "language": {
                      "type": "text"
                    },
                    "status":{
                      "type":"text"
                    },
                    "date-indexed": {
                      "type": "date"
                    }
                  }
                }
              }
            }
          }
        }
        '''
        self.index = self.config['index']
        try:
            self.es = Elasticsearch([{
                'host': self.config['host'],
                'port': self.config['port']}])
            # ignore=400 keeps an HTTP 400 answer from raising here.
            self.es.indices.create(
                index=self.index,
                body=self.mapping,
                ignore=400)
        except Exception as e:
            self.logger.error(e)
            self.logger.error(traceback.format_exc())
            # This is a fatal error: report it with a non-zero exit status.
            sys.exit(1)

    def count(self):
        """Refresh the index and log how many documents it holds.

        The count is logged at info level when at least one shard answered
        and none failed; otherwise the raw status is logged as an error.
        """
        self.es.indices.refresh(self.index)
        status = self.es.count(index=self.index)
        shards = status['_shards']
        # An index may have several shards, so do not require exactly one.
        if shards['successful'] >= 1 and not shards.get('failed'):
            self.logger.info('Successful')
            self.logger.info('Count:%d', status['count'])
        else:
            self.logger.error(status)

    def save(self, doc):
        """Index ``doc`` into the configured index, then log the new count."""
        self.es.index(index=self.index, body=doc)
        self.count()
|
@ -0,0 +1,33 @@
|
|||||||
|
import os
|
||||||
|
import logging
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
class LoggerHandler:
    """Set up the "OnionScraper" logger with console and file output."""

    def __init__(self, level):
        """Attach a console handler and an ``info.log`` file handler.

        :param level: name of a ``logging`` level attribute, e.g. ``"INFO"``.
        """
        self.level = getattr(logging, level)
        self.logger = logging.getLogger("OnionScraper")
        self.logger.setLevel(self.level)

        # Console output, filtered at the requested level.
        console = logging.StreamHandler()
        console.setLevel(self.level)

        # File output: info.log one directory above this module's directory.
        log_dir = Path(__file__).parents[1]
        file_handler = logging.FileHandler(os.path.join(log_dir, "info.log"))

        # The file format additionally carries the logger name and weekday.
        file_format = logging.Formatter(
            '[%(asctime)s] - %(name)s - %(levelname)s - %(message)s',
            datefmt='%a, %d %b %Y %H:%M:%S')
        console_format = logging.Formatter(
            '[%(asctime)s] - %(levelname)s - %(message)s',
            datefmt='%d %b %Y %H:%M:%S')
        console.setFormatter(console_format)
        file_handler.setFormatter(file_format)

        # Register the console first, then the file.
        for handler in (console, file_handler):
            self.logger.addHandler(handler)

    def start_logging(self):
        """Log the start-up message and return the configured logger."""
        self.logger.info('Starting OnionScraper')
        return self.logger
|
||||||
|
|
@ -0,0 +1,95 @@
|
|||||||
|
import re
|
||||||
|
import sys
|
||||||
|
import json
|
||||||
|
|
||||||
|
|
||||||
|
class Operator:
    """Base class for all Operator plugins.

    Note: This is an abstract class. Subclasses must call this ``__init__``
    via ``super`` and must override ``handle_onion`` with the same
    signature. ``process`` filters each onion URL against the blacklist
    and calls ``handle_onion`` for the ones that pass.

    When adding additional methods to child classes, consider prefixing the
    method name with an underscore to denote a ``_private_method``.
    """

    def __init__(self, logger, elasticsearch, allowed_sources=None):
        """Store the collaborators and compile the blacklist pattern.

        :param logger: logger used for messages.
        :param elasticsearch: object exposing ``save(doc)``.
        :param allowed_sources: iterable of blacklisted words; each is
            matched literally and case-insensitively. ``None`` or an empty
            iterable means nothing is blacklisted.
        """
        self.logger = logger
        words = [re.escape(word) for word in (allowed_sources or []) if word]
        # Joining zero words gives '', a pattern that matches every string
        # and would block everything; '(?!)' is a pattern that never matches.
        pattern = '|'.join(words) if words else r'(?!)'
        self.blacklist = re.compile(pattern, re.IGNORECASE)
        self.es = elasticsearch

    def response(self, content, onion, operator_name):
        """Wrap ``content`` in the document shape stored for an onion.

        :param content: payload for this operator. A string is parsed as
            JSON when possible; anything else is stored unchanged.
        :param onion: hidden-service address (str).
        :param operator_name: key under which ``content`` is stored.
        :returns: ``{operator_name: content, 'hiddenService': onion}``
        """
        if isinstance(content, str):
            try:
                content = json.loads(content)
            except json.decoder.JSONDecodeError:
                # Not JSON: keep the raw text.
                self.logger.info('JosnDecode Error')
        return {operator_name: content, 'hiddenService': onion}

    def handle_onion(self, url):
        """Override with the same signature.

        :param url: address of a single onion that passed the blacklist.
        :returns: None (the return value is ignored by ``process``)
        """
        raise NotImplementedError()

    def _onion_is_allowed(self, response, type='URL'):
        """Return True unless ``response`` matches the blacklist.

        :param response: document as built by ``response``.
        :param type: ``'URL'`` checks ``response['hiddenService']``;
            ``'HTML'`` checks ``response['simple-html']['HTML']``.
        :raises ValueError: for any other ``type``.

        A blacklisted document is saved to Elasticsearch (an HTML document
        with its status set to ``'blocked'`` first) and False is returned.
        """
        if type == 'URL':
            text = response['hiddenService']
        elif type == 'HTML':
            text = response['simple-html']['HTML']
        else:
            raise ValueError(f"Unsupported filter type: {type!r}")

        if self.blacklist.search(text):
            if type == 'HTML':
                # Only a page that actually matched is marked as blocked.
                response['simple-html']['status'] = 'blocked'
            self.es.save(response)
            return False
        return True

    def process(self, onions):
        """Call ``handle_onion`` for every onion whose URL is not blacklisted.

        :param onions: iterable of objects exposing a ``url`` attribute.
        """
        for onion in onions:
            # The candidate document is what gets saved if the URL is blocked.
            candidate = self.response(
                {'status': 'blocked'}, onion.url, 'regex-blacklist')
            if self._onion_is_allowed(candidate, type='URL'):
                self.handle_onion(onion.url)
|
||||||
|
|
@ -0,0 +1,99 @@
|
|||||||
|
import time
|
||||||
|
import json
|
||||||
|
import traceback
|
||||||
|
from datetime import datetime as dt
|
||||||
|
from json.decoder import JSONDecodeError
|
||||||
|
|
||||||
|
import requests
|
||||||
|
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
|
from langdetect import detect
|
||||||
|
|
||||||
|
from stem.control import Controller
|
||||||
|
from stem import Signal
|
||||||
|
|
||||||
|
from onioningestor.operators import Operator
|
||||||
|
|
||||||
|
|
||||||
|
class Plugin(Operator):
    """Simple-html

    This plugin collects HTML code from onion link
    """

    def __init__(self, logger, elasticsearch, allowed_sources, **kwargs):
        """Read the plugin settings from ``kwargs``.

        Required keys: ``timeout``, ``retries``, ``socks5`` (assigned as
        the session proxies) and ``TorController`` (mapping with ``port``
        and ``password``).
        """
        super(Plugin, self).__init__(logger, elasticsearch, allowed_sources)
        self.plugin_name = 'simple-html'
        self.logger.info(f"Initializing {self.plugin_name}")

        self.timeout = int(kwargs['timeout'])
        self.retries = int(kwargs['retries'])

        self.proxy = kwargs['socks5']
        self.torControl = kwargs['TorController']
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:72.0) Gecko/20100101 Firefox/72.0',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'DNT': '1',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
        }

    def get_tor_session(self):
        """Return a new ``requests`` session using the proxy and headers.

        Failures are logged and re-raised; there is no usable session to
        return in that case.
        """
        try:
            session = requests.session()
            session.proxies = self.proxy
            session.headers.update(self.headers)
        except Exception as e:
            self.logger.error(e)
            self.logger.debug(traceback.format_exc())
            raise
        return session

    def renew_connection(self):
        """Ask the Tor controller for a new identity and log the new IP."""
        with Controller.from_port(port=self.torControl['port']) as controller:
            # Now we switch TOR identities to make sure we have a good connection
            self.logger.info('Getting new Tor IP')
            # authenticate to our local TOR controller
            controller.authenticate(self.torControl['password'])
            # send the signal for a new identity
            controller.signal(Signal.NEWNYM)
            # wait for the new identity to be initialized
            time.sleep(controller.get_newnym_wait())
            session = self.get_tor_session()
            self.logger.info(f"IP is {session.get('http://httpbin.org/ip').json()['origin']}")

    def run_sessions(self, onion):
        """Fetch ``http://<onion>`` and build the document to store.

        :param onion: hidden-service address without scheme.
        :returns: ``response`` document with status ``'success'`` and the
            page data, or with status ``'failure'`` once more than
            ``self.retries`` retries have failed.
        """
        url = 'http://' + onion
        retry = 0
        while True:
            try:
                self.logger.info(url)
                # requests never times out by default.
                # assumes the configured timeout is in seconds - TODO confirm
                content = self.get_tor_session().get(url, timeout=self.timeout)
                if content.status_code == 200 and content.text:
                    result = content.text
                    html = BeautifulSoup(result, features="lxml")
                    # html.title is None for a page without a <title> tag.
                    title = html.title.text if html.title else None
                    index = {
                        'HTML': result,
                        'title': title,
                        'language': detect(html.text),
                        'date-crawled': dt.utcnow().strftime('%Y-%m-%dT%H:%M:%S.%f') + 'Z',
                        'status': 'success',
                    }
                    return self.response(index, onion, self.plugin_name)
            except requests.exceptions.ConnectionError as connection_error:
                # url already carries the scheme.
                self.logger.error(f'Failed connecting to {url}')
                self.logger.debug(connection_error)
            except Exception as e:
                self.logger.error(e)
                self.logger.debug(traceback.format_exc())

            retry += 1
            if retry > self.retries:
                self.logger.error('[x] Max retries exceeded')
                return self.response({'status': "failure"}, onion, self.plugin_name)

            # Only pay for a new Tor identity when another attempt follows.
            self.logger.info('[x] No results found retrying ...')
            self.renew_connection()

    def handle_onion(self, onion):
        """Scrape ``onion`` and save the page if it was fetched and allowed."""
        content = self.run_sessions(onion)
        if content[self.plugin_name]['status'] == 'success':
            if self._onion_is_allowed(content):
                self.es.save(content)
|
||||||
|
|
@ -0,0 +1,264 @@
|
|||||||
|
import re
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
import json
|
||||||
|
import time
|
||||||
|
import random
|
||||||
|
import traceback
|
||||||
|
import subprocess
|
||||||
|
from uuid import uuid4
|
||||||
|
from pathlib import Path
|
||||||
|
from datetime import datetime as dt
|
||||||
|
from json.decoder import JSONDecodeError
|
||||||
|
from concurrent.futures import ProcessPoolExecutor
|
||||||
|
from threading import Timer
|
||||||
|
|
||||||
|
import requests
|
||||||
|
|
||||||
|
from stem.control import Controller
|
||||||
|
from stem import Signal
|
||||||
|
|
||||||
|
from selenium import webdriver
|
||||||
|
from selenium.webdriver.firefox.options import Options
|
||||||
|
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
|
||||||
|
|
||||||
|
from onionscraper.operators import Operator
|
||||||
|
|
||||||
|
class Plugin(Operator):
|
||||||
|
"""OnionScraper main work logic.
|
||||||
|
|
||||||
|
Handles reading the config file, calling sources, maintaining state and
|
||||||
|
sending artifacts to operators.
|
||||||
|
"""
|
||||||
|
def __init__(self, logger, **kwargs):
|
||||||
|
self.logger = logger
|
||||||
|
self.logger.info('Initializing OnionScanner')
|
||||||
|
screenshots = kwargs.pop('screenshots_path', None)
|
||||||
|
if screenshots:
|
||||||
|
self.screenshots = Path(screenshots)
|
||||||
|
else:
|
||||||
|
self.screenshots = Path(__file__).parents[1]/'screenshots'
|
||||||
|
self.onionscan = kwargs['binpath']
|
||||||
|
self.timeout = int(kwargs['timeout'])
|
||||||
|
self.proxy = kwargs['socks5']
|
||||||
|
self.torControl = kwargs['TorController']
|
||||||
|
self.retries = int(kwargs['retries'])
|
||||||
|
self.headers ={
|
||||||
|
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:72.0) Gecko/20100101 Firefox/72.0',
|
||||||
|
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
|
||||||
|
'Accept-Language':'en-US,en;q=0.5',
|
||||||
|
'DNT': '1', 'Connection':
|
||||||
|
'keep-alive',
|
||||||
|
'Upgrade-Insecure-Requests': '1'}
|
||||||
|
|
||||||
|
|
||||||
|
blacklist = kwargs['blacklist'].split(',')
|
||||||
|
self.blacklist = re.compile('|'.join([re.escape(word) for word in blacklist]), re.IGNORECASE)
|
||||||
|
keywords = kwargs['interestingKeywords'].split(',')
|
||||||
|
self.keywords = re.compile('|'.join([re.escape(word) for word in keywords]), re.IGNORECASE)
|
||||||
|
self.session = self.get_tor_session()
|
||||||
|
|
||||||
|
def response(self, status, content, onion):
|
||||||
|
"""
|
||||||
|
status: success/failure
|
||||||
|
content: dict
|
||||||
|
onion: str
|
||||||
|
return: dict
|
||||||
|
"""
|
||||||
|
return {'status': status, 'data': content, 'onion': onion}
|
||||||
|
|
||||||
|
def parseDoc(self, data):
|
||||||
|
data['onionscan'].pop('simpleReport', None)
|
||||||
|
crawls = data['onionscan'].pop('crawls', None)
|
||||||
|
hiddenService = data['onionscan'].pop('hiddenService', None)
|
||||||
|
data['onionscan']['crawls'] = [*crawls]
|
||||||
|
data['hiddenService'] = hiddenService
|
||||||
|
for onion in crawls.keys():
|
||||||
|
print(onion)
|
||||||
|
with open('/home/tony/Projects/OnionScraper_v2/onion_master_list.txt', 'a') as fp:
|
||||||
|
fp.write("%s\n" % onion)
|
||||||
|
#q.enqueue(self.crawl, onion)
|
||||||
|
#with open('test.json', 'w', encoding='utf-8') as f:
|
||||||
|
# json.dump(data, f, ensure_ascii=False, indent=4)
|
||||||
|
return data
|
||||||
|
|
||||||
|
def format_directory(self, directory):
    """Return today's ``directory/<year>/<month>/<day>`` folder.

    Month and day are zero-padded to two digits. The folder is created
    (including parents) when it does not exist yet.

    :param directory: base path supporting the ``/`` operator.
    :returns: the dated path.
    """
    today = dt.now()
    save_path = directory / str(today.year) / f"{today.month:02d}" / f"{today.day:02d}"

    if os.path.isdir(save_path):
        return save_path

    self.logger.info("[*] Creating directory to save screenshots")
    os.makedirs(save_path)
    return save_path
|
||||||
|
|
||||||
|
def take_screenshot(self, save_path, onion):
|
||||||
|
binary = FirefoxBinary('/home/tony/Projects/OnionScraper/geckodriver')
|
||||||
|
fp = webdriver.FirefoxProfile()
|
||||||
|
fp.set_preference('network.proxy.type', 1)
|
||||||
|
fp.set_preference('network.proxy.socks', '127.0.0.1')
|
||||||
|
fp.set_preference('network.proxy.socks_port', 9050)
|
||||||
|
fp.set_preference('network.proxy.socks_remote_dns', True)
|
||||||
|
|
||||||
|
options = Options()
|
||||||
|
options.headless = True
|
||||||
|
driver = webdriver.Firefox(
|
||||||
|
executable_path='/home/tony/Projects/OnionScraper/geckodriver',
|
||||||
|
options=options,
|
||||||
|
firefox_profile=fp)
|
||||||
|
url = 'http://' + onion
|
||||||
|
driver.get(url)
|
||||||
|
uid = str(uuid4()).split('-')[0]
|
||||||
|
filename = f"{onion}_screenshot_{uid}.png"
|
||||||
|
f_name = f"{save_path}/{filename}"
|
||||||
|
driver.save_screenshot(f_name)
|
||||||
|
|
||||||
|
driver.quit()
|
||||||
|
|
||||||
|
if os.path.isfile(f_name):
|
||||||
|
self.logger.info(f'[*] Screenshot was taken. {f_name}')
|
||||||
|
dateScreenshoted = dt.utcnow().strftime('%Y-%m-%dT%H:%M:%S.%f')+ 'Z'
|
||||||
|
result = {'dateScreenshoted':dateScreenshoted,'filename':filename}
|
||||||
|
return self.response("success",result,onion)
|
||||||
|
else:
|
||||||
|
self.logger.error('[x] Unable to take screenshot')
|
||||||
|
return self.response("failure",None,onion)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def get_tor_session(self):
|
||||||
|
try:
|
||||||
|
s = requests.session()
|
||||||
|
s.proxies = self.proxy
|
||||||
|
s.headers.update(self.headers)
|
||||||
|
except Exception as e:
|
||||||
|
self.logger.error(e)
|
||||||
|
self.logger.debug(traceback.print_exc())
|
||||||
|
return s
|
||||||
|
|
||||||
|
# signal TOR for a new connection
|
||||||
|
def renew_connection(self):
|
||||||
|
with Controller.from_port(port = self.torControl['port']) as controller:
|
||||||
|
# Now we switch TOR identities to make sure we have a good connection
|
||||||
|
self.logger.info('Getting new Tor IP')
|
||||||
|
# authenticate to our local TOR controller
|
||||||
|
controller.authenticate(self.torControl['password'])
|
||||||
|
# send the signal for a new identity
|
||||||
|
controller.signal(Signal.NEWNYM)
|
||||||
|
# wait for the new identity to be initialized
|
||||||
|
time.sleep(controller.get_newnym_wait())
|
||||||
|
session = self.get_tor_session()
|
||||||
|
self.logger.info(f"IP is {session.get('http://httpbin.org/ip').json()['origin']}")
|
||||||
|
|
||||||
|
def handle_timeout(self, process, onion):
    """Handle a timeout from the onionscan process.

    Kills the scan process (best effort) and then renews the Tor connection.

    :param process: Process object of the running scan; only ``kill()`` is
        called on it.
    :param onion: Onion address being scanned; used for logging only.
    """
    try:
        # kill the onionscan process
        process.kill()
        self.logger.info("[!!!] Killed the onionscan process.")
    except Exception as e:
        # Still best effort, but record why the kill failed and let
        # KeyboardInterrupt / SystemExit propagate (a bare ``except`` did not).
        self.logger.debug(f"Could not kill the onionscan process for {onion}: {e}")
    self.renew_connection()
|
||||||
|
|
||||||
|
def run_sessions(self, onion):
    """Fetch ``http://<onion>`` through ``self.session``, retrying on failure.

    A 200 response is decoded as JSON; if the body is not valid JSON the raw
    response text is used instead. A non-200 status, an empty result or a
    request error counts as a failed attempt. After a failed attempt the Tor
    connection is renewed and the request is repeated, for at most
    ``self.retries`` retries.

    :param onion: Onion host name, without scheme.
    :returns: ``self.response("success", data, onion)`` on success, otherwise
        ``self.response("failure", None, onion)``.
    """
    url = 'http://' + onion
    retry = 0
    while True:
        result = None
        try:
            self.logger.info(url)
            content = self.session.get(url)
            if content.status_code == 200:
                result = content.json()
        except JSONDecodeError as e:
            # Not JSON: fall back to the plain body (typically HTML).
            self.logger.debug(f'JSONDecodeError {e}')
            result = content.text
        except Exception as e:
            self.logger.error(e)
            # format_exc() returns the traceback text; print_exc() returns None.
            self.logger.debug(traceback.format_exc())

        # Decide after the try block instead of returning from ``finally``,
        # which silently discarded any in-flight exception.
        if result:
            return self.response("success", result, onion)

        self.logger.info('[x] No results found retrying ...')
        retry += 1
        if retry > self.retries:
            self.logger.error('[x] Max retries exceeded')
            return self.response("failure", None, onion)
        # Only request a new identity when another attempt will follow.
        self.renew_connection()
|
||||||
|
|
||||||
|
def run_onionscan(self, onion):
    """Run the onionscan binary against ``onion`` and parse its JSON report.

    The binary at ``self.onionscan`` is started with JSON reporting enabled.
    A watchdog timer calls ``self.handle_timeout`` if the scan is still
    running after 300 seconds.

    :param onion: Onion address to scan.
    :returns: ``self.response("success", report, onion)`` where ``report`` is
        the parsed JSON output, or ``self.response("failure", None, onion)``
        when the scan timed out or printed invalid JSON.
    """
    self.logger.info("[*] Running onionscan on %s", onion)

    # fire up onionscan
    process = subprocess.Popen(
        [self.onionscan, "--webport=0", "--jsonReport", "--simpleReport=false", onion],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )

    # start the watchdog; it fires after 300 seconds
    process_timer = Timer(300, self.handle_timeout, args=[process, onion])
    process_timer.start()

    try:
        # wait for the onion scan results
        stdout = process.communicate()[0]
        # A timer that is still pending means the scan ended on its own.
        finished_in_time = process_timer.is_alive()
    finally:
        # Never leave the watchdog armed, even if communicate() raised.
        process_timer.cancel()

    if finished_in_time:
        try:
            return self.response("success", json.loads(stdout), onion)
        except json.decoder.JSONDecodeError:
            # Report the actual cause rather than a timeout.
            self.logger.info("[!!!] onionscan returned invalid JSON for %s", onion)
    else:
        self.logger.info("[!!!] Process timed out for %s", onion)

    return self.response("failure", None, onion)
|
||||||
|
|
||||||
|
def handle_onion(self, onion_tuple):
    """Scan, fetch, screenshot and index a single onion.

    Steps, each of which can stop the processing:

    1. the address is checked against ``self.blacklist``;
    2. ``run_onionscan`` must report success;
    3. ``run_sessions`` must report success;
    4. the fetched content is checked against ``self.blacklist``.

    When every step passes, a screenshot is taken and the combined document
    is handed to ``self.parseDoc``.

    :param onion_tuple: Record whose ``url`` attribute is the onion address.
    :returns: The result of ``self.parseDoc(doc)``, or ``None`` when
        processing stopped early or an error was logged.
    """
    onion = onion_tuple.url
    self.logger.info(f'Processing {onion} with onionscan')
    try:
        blacklist_URL = self.blacklist.search(onion)
        if blacklist_URL:
            self.logger.info(f"[X] Blocked by blacklist => matched keyword {blacklist_URL.group()}")
            return None
        self.logger.debug("[*] URL blacklist test: PASSED")

        results = self.run_onionscan(onion)
        if results['status'] != 'success':  # and results['data']['webDetected'] == 'true':
            self.logger.info(f"[x] hidden service {onion} is not active")
            return None

        content = self.run_sessions(onion)
        if content['status'] != 'success':
            return None

        blacklist_CONTENT = self.blacklist.search(content['data'])
        if blacklist_CONTENT:
            self.logger.info(f"[X] Blocked by blacklist content => matched keyword {blacklist_CONTENT.group()}")
            return None
        self.logger.debug("[*] CONTENT blacklist test: PASSED")

        screenshot = self.take_screenshot(self.format_directory(self.screenshots), onion)
        self.logger.info("Indexing!")
        doc = {
            # NOTE(review): run_onionscan passes an already parsed report to
            # self.response; confirm results['data'] is still a JSON string.
            'onionscan': json.loads(results['data']),
            'html': content['data'],
            'screenshots': screenshot['data'],
            'interestingKeywords': self.interestingKeywords.findall(content['data']),
        }
        return self.parseDoc(doc)
    except Exception as e:
        self.logger.error(e)
        # format_exc() returns the traceback text; print_exc() returns None.
        self.logger.error(traceback.format_exc())
    return None
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -0,0 +1,15 @@
|
|||||||
|
|
||||||
|
from onionscraper.operators import Operator
|
||||||
|
|
||||||
|
class Plugin(Operator):
    """Operator meant to write artifacts to a flat CSV file (stub)."""

    def __init__(self, filename, base_score):
        """Store the output file name.

        ``base_score`` is accepted but not stored, and the base-class
        constructor is not called.
        """
        # super(Plugin, self).__init__(artifact_types, filter_string, allowed_sources)
        self.filename = filename

    def handle_artifact(self, artifact):
        """Operate on a single artifact; currently does nothing."""
        return None
|
@ -0,0 +1,41 @@
|
|||||||
|
from collections import namedtuple
|
||||||
|
|
||||||
|
class Source(object):
    """Base class for all Source plugins.

    Note: This is an abstract class. You must override ``__init__`` and ``run``
    in child classes. You should not override ``process_element``. When adding
    additional methods to child classes, consider prefixing the method name
    with an underscore to denote a ``_private_method``.
    """

    # Record type for a discovered onion. It lives on the class so that
    # ``self.onion`` is available even before ``__init__`` has run.
    onion = namedtuple('onion', ['url', 'source', 'type'])

    def __init__(self, name, *args, **kwargs):
        """Override this constructor in child classes.

        The first argument must always be ``name``.
        Other arguments should be url, auth, etc, whatever is needed to set
        up the object.

        Nothing is stored here; child classes set their own attributes
        (``name``, ``logger``, ...).
        """

    def run(self):
        """Run the source and return the onions it found.

        Override this method in child classes; the base implementation always
        raises. The sources visible in this project return an iterable of
        ``self.onion`` records.

        :raises NotImplementedError: Always, in the base class.
        """
        raise NotImplementedError()

    def process_element(self, content, reference_link, include_nonobfuscated=False):
        """Log that an element is being processed by this source.

        Only a debug line is emitted; nothing is extracted or returned yet.

        :param content: String content to extract from.
        :param reference_link: Reference link to attach to all artifacts.
        :param include_nonobfuscated: Include non-defanged URLs in output?
        """
        # ``logger`` and ``name`` are set by child classes, not by this base
        # class, so look them up defensively instead of using an undefined
        # global ``logger`` (which raised NameError).
        log = getattr(self, 'logger', None)
        if log is not None:
            source_name = getattr(self, 'name', type(self).__name__)
            log.debug(f"Processing in source '{source_name}'")
|
||||||
|
|
@ -0,0 +1,153 @@
|
|||||||
|
#!/usr/bin/python3
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
|
__author__ = 'Andrey Glauzer'
|
||||||
|
__license__ = "MIT"
|
||||||
|
__version__ = "1.0.1"
|
||||||
|
__maintainer__ = "Andrey Glauzer"
|
||||||
|
__status__ = "Development"
|
||||||
|
|
||||||
|
import requests
|
||||||
|
import json
|
||||||
|
import re
|
||||||
|
import re
|
||||||
|
import urllib.parse
|
||||||
|
from random import choice
|
||||||
|
import time
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
|
|
||||||
|
from onionscraper.sources import Source
|
||||||
|
|
||||||
|
|
||||||
|
class Plugin(Source):
    """Source plugin that collects .onion addresses from public GitHub gists."""

    def __init__(self, logger, name, url):
        """Store the logger, plugin name and the gist search URL to scrape."""
        self.logger = logger
        self.name = name
        self.url = url
        self.desktop_agents = [
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:60.0) Gecko/20100101 Firefox/60.0']
        # Hand the plugin name, not the instance itself, to the base class.
        super().__init__(name)

    def run(self):
        """Run all scraping stages and return the list of onion records."""
        self.logger.info('Starting Gist Scraper')
        self.cookies()
        self.pagination()
        self.scraping()
        return self.raw()

    @property
    def random_headers(self):
        """Request headers with a user agent picked from ``desktop_agents``."""
        return {
            'User-Agent': choice(self.desktop_agents),
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
        }

    def cookies(self):
        """Open the HTTP session and make a first request to ``self.url``."""
        self.logger.info('Setting GIST cookies')

        # The session is reused by pagination(), scraping() and raw(), so it
        # must stay open; a ``with`` block would close it on exit.
        self.session = requests.Session()
        self.headers = self.random_headers

        request = self.session.get(self.url, headers=self.headers)
        if request.status_code != 200:
            self.logger.error('No Response from GIST')

    def pagination(self):
        """Build ``self.urls``, the list of search result pages to visit."""
        request = self.session.get(
            f"https://gist.github.com/search?l=Text&q={urllib.parse.quote('.onio')}", headers=self.headers)
        self.soup = BeautifulSoup(request.content, features="lxml")

        self.urls = [self.url]

        # A missing pagination block simply means there are no extra pages.
        pager = self.soup.find('div', {'class': 'pagination'})
        pages = []
        if pager is not None:
            pages = [link.get_text() for link in pager.findAll('a')]

        if pages:
            cont = 2
            # NOTE: with the bound fixed at 1 this loop never runs, so only
            # the first page is scraped; the commented bound is the page count.
            while cont <= 1:  # int(pages[-2]):
                cont += 1
                full_url = f"https://gist.github.com/search?l=Text&p={cont-1}&q={urllib.parse.quote('.onio')}"
                self.urls.append(full_url)

    def scraping(self):
        """Collect raw-file URLs of gists that mention ``.onion``.

        The result is stored in ``self.urls_raw``.
        """
        gist_pages = []
        for inurl in self.urls:
            self.logger.info(f"Connecting to {inurl}")
            time.sleep(5)
            request = self.session.get(inurl, headers=self.headers)
            if request.status_code != 200:
                continue

            soup = BeautifulSoup(request.content, features="lxml")
            for code in soup.findAll('div', {'class': 'gist-snippet'}):
                if '.onion' not in code.get_text().lower():
                    continue
                for raw in code.findAll('a', {'class': 'link-overlay'}):
                    # Links without an href are skipped.
                    href = raw.get('href')
                    if href is not None:
                        gist_pages.append(href)

        self.urls_raw = []
        for get in gist_pages:
            self.logger.info(f"Connecting to {get}")
            time.sleep(5)
            try:
                request = self.session.get(get, headers=self.headers)
            except (requests.exceptions.ConnectionError,
                    requests.exceptions.ChunkedEncodingError,
                    requests.exceptions.ReadTimeout,
                    requests.exceptions.InvalidURL) as e:
                self.logger.error(
                    f"I was unable to connect to the url, because an error occurred.\n{e}")
                continue

            if request.status_code != 200:
                continue

            soup = BeautifulSoup(request.content, features="lxml")
            for raw in soup.findAll('a', {'class': 'btn btn-sm'}):
                href = raw.get('href')
                if href is not None:
                    self.urls_raw.append(f"https://gist.githubusercontent.com{href}")

    def raw(self):
        """Download the ``.txt``/``.csv`` raw files and extract onion records.

        :returns: List of ``self.onion`` records with ``source='gist'`` and
            ``type='domain'``.
        """
        self.logger.info('Performing replaces and regex. WAIT...')
        regex = re.compile(
            r"[A-Za-z0-9]{0,12}\.?[A-Za-z0-9]{12,50}\.onion")

        itens = []
        for raw in self.urls_raw:
            lowered = raw.lower()
            if '.txt' not in lowered and '.csv' not in lowered:
                continue
            time.sleep(5)
            request = self.session.get(raw, headers=self.headers)
            self.soup = BeautifulSoup(request.content, features="lxml")
            for pre in self.soup.findAll('body'):
                itens.extend(pre.get_text().split('\n'))

        # Match every collected line exactly once, after all downloads.
        onions = []
        for lines in itens:
            rurls = lines \
                .replace('\xad', '') \
                .replace('\n', '') \
                .replace("http://", '') \
                .replace("https://", '') \
                .replace("www.", "")

            url = regex.match(rurls)
            if url is not None:
                onions.append(self.onion(url=url.group(), source='gist', type='domain'))
        return onions
|
@ -0,0 +1,153 @@
|
|||||||
|
#!/usr/bin/python3
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
|
__author__ = 'Daneiele Perera'
|
||||||
|
__license__ = "MIT"
|
||||||
|
__version__ = "1.0.1"
|
||||||
|
__maintainer__ = "Daniele Perera"
|
||||||
|
__status__ = "Development"
|
||||||
|
|
||||||
|
import requests
|
||||||
|
import json
|
||||||
|
import re
|
||||||
|
import re
|
||||||
|
import urllib.parse
|
||||||
|
from random import choice
|
||||||
|
import time
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
|
|
||||||
|
from onionscraper.sources import Source
|
||||||
|
|
||||||
|
|
||||||
|
class Plugin(Source):
    """Source plugin that collects .onion addresses from public GitHub gists."""

    def __init__(self, logger, name, url):
        """Store the logger, plugin name and the gist search URL to scrape."""
        self.logger = logger
        self.name = name
        self.url = url
        self.desktop_agents = [
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:60.0) Gecko/20100101 Firefox/60.0']
        # Hand the plugin name, not the instance itself, to the base class.
        super().__init__(name)

    def run(self):
        """Run all scraping stages and return the list of onion records."""
        self.logger.info('Starting Gist Scraper')
        self.cookies()
        self.pagination()
        self.scraping()
        return self.raw()

    @property
    def random_headers(self):
        """Request headers with a user agent picked from ``desktop_agents``."""
        return {
            'User-Agent': choice(self.desktop_agents),
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
        }

    def cookies(self):
        """Open the HTTP session and make a first request to ``self.url``."""
        self.logger.info('Setting GIST cookies')

        # The session is reused by pagination(), scraping() and raw(), so it
        # must stay open; a ``with`` block would close it on exit.
        self.session = requests.Session()
        self.headers = self.random_headers

        request = self.session.get(self.url, headers=self.headers)
        if request.status_code != 200:
            self.logger.error('No Response from GIST')

    def pagination(self):
        """Build ``self.urls``, the list of search result pages to visit."""
        request = self.session.get(
            f"https://gist.github.com/search?l=Text&q={urllib.parse.quote('.onio')}", headers=self.headers)
        self.soup = BeautifulSoup(request.content, features="lxml")

        self.urls = [self.url]

        # A missing pagination block simply means there are no extra pages.
        pager = self.soup.find('div', {'class': 'pagination'})
        pages = []
        if pager is not None:
            pages = [link.get_text() for link in pager.findAll('a')]

        if pages:
            cont = 2
            # NOTE: with the bound fixed at 1 this loop never runs, so only
            # the first page is scraped; the commented bound is the page count.
            while cont <= 1:  # int(pages[-2]):
                cont += 1
                full_url = f"https://gist.github.com/search?l=Text&p={cont-1}&q={urllib.parse.quote('.onio')}"
                self.urls.append(full_url)

    def scraping(self):
        """Collect raw-file URLs of gists that mention ``.onion``.

        The result is stored in ``self.urls_raw``.
        """
        gist_pages = []
        for inurl in self.urls:
            self.logger.info(f"Connecting to {inurl}")
            time.sleep(5)
            request = self.session.get(inurl, headers=self.headers)
            if request.status_code != 200:
                continue

            soup = BeautifulSoup(request.content, features="lxml")
            for code in soup.findAll('div', {'class': 'gist-snippet'}):
                if '.onion' not in code.get_text().lower():
                    continue
                for raw in code.findAll('a', {'class': 'link-overlay'}):
                    # Links without an href are skipped.
                    href = raw.get('href')
                    if href is not None:
                        gist_pages.append(href)

        self.urls_raw = []
        for get in gist_pages:
            self.logger.info(f"Connecting to {get}")
            time.sleep(5)
            try:
                request = self.session.get(get, headers=self.headers)
            except (requests.exceptions.ConnectionError,
                    requests.exceptions.ChunkedEncodingError,
                    requests.exceptions.ReadTimeout,
                    requests.exceptions.InvalidURL) as e:
                self.logger.error(
                    f"I was unable to connect to the url, because an error occurred.\n{e}")
                continue

            if request.status_code != 200:
                continue

            soup = BeautifulSoup(request.content, features="lxml")
            for raw in soup.findAll('a', {'class': 'btn btn-sm'}):
                href = raw.get('href')
                if href is not None:
                    self.urls_raw.append(f"https://gist.githubusercontent.com{href}")

    def raw(self):
        """Download the ``.txt``/``.csv`` raw files and extract onion records.

        :returns: List of ``self.onion`` records with ``source='gist'`` and
            ``type='domain'``.
        """
        self.logger.info('Performing replaces and regex. WAIT...')
        regex = re.compile(
            r"[A-Za-z0-9]{0,12}\.?[A-Za-z0-9]{12,50}\.onion")

        itens = []
        for raw in self.urls_raw:
            lowered = raw.lower()
            if '.txt' not in lowered and '.csv' not in lowered:
                continue
            time.sleep(5)
            request = self.session.get(raw, headers=self.headers)
            self.soup = BeautifulSoup(request.content, features="lxml")
            for pre in self.soup.findAll('body'):
                itens.extend(pre.get_text().split('\n'))

        # Match every collected line exactly once, after all downloads.
        onions = []
        for lines in itens:
            rurls = lines \
                .replace('\xad', '') \
                .replace('\n', '') \
                .replace("http://", '') \
                .replace("https://", '') \
                .replace("www.", "")

            url = regex.match(rurls)
            if url is not None:
                onions.append(self.onion(url=url.group(), source='gist', type='domain'))
        return onions
|
@ -0,0 +1,120 @@
|
|||||||
|
#!/usr/bin/python3
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
|
__author__ = 'Andrey Glauzer'
|
||||||
|
__license__ = "MIT"
|
||||||
|
__version__ = "1.0.1"
|
||||||
|
__maintainer__ = "Andrey Glauzer"
|
||||||
|
__status__ = "Development"
|
||||||
|
|
||||||
|
import requests
|
||||||
|
import json
|
||||||
|
import re
|
||||||
|
import logging
|
||||||
|
import re
|
||||||
|
import urllib.parse
|
||||||
|
from random import choice
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
|
|
||||||
|
class Reddit:
    """Collect .onion addresses from pages linked in r/onions comments."""

    def __init__(self):
        """Create the HTTP session and set the comment-search API URL."""
        self.session = requests.session()

        self.source = 'Reddit'

        self.url = 'https://api.pushshift.io/reddit/search/comment/?subreddit=onions&limit=1000000'
        self.desktop_agents = [
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:60.0) Gecko/20100101 Firefox/60.0']

    @property
    def random_headers(self):
        """Request headers with a user agent picked from ``desktop_agents``."""
        return {
            'User-Agent': choice(self.desktop_agents),
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
        }

    @property
    def start(self):
        """Run the scraper and return the addresses it found.

        This stays a property because callers trigger it through plain
        attribute access (``app.start``).
        """
        return self.reddit_json()

    def _extract_onions(self, text):
        """Return every onion address found in ``text``, in order."""
        regex = re.compile(
            r"[A-Za-z0-9]{0,12}\.?[A-Za-z0-9]{12,50}\.onion")
        found = []
        for lines in text.split('\n'):
            # NOTE(review): r'\s' is a literal backslash-s here, not a
            # whitespace class; kept as written.
            rurls = lines \
                .replace('\xad', '') \
                .replace("http://", '') \
                .replace("https://", '') \
                .replace(r'\s', '') \
                .replace('\t', '')
            # Scan the whole line: raw() joins the page text with spaces, so
            # an anchored match() would only ever see the first token.
            found.extend(hit.group() for hit in regex.finditer(rurls))
        return found

    def reddit_json(self):
        """Query the comment API, follow the linked pages and extract onions.

        :returns: List of onion address strings; empty when the API request
            fails with a connection-level error.
        """
        print('Getting Reddit API information')
        onionurl = []
        try:
            request = self.session.get(self.url, headers=self.random_headers)
        except (requests.exceptions.ConnectionError,
                requests.exceptions.ChunkedEncodingError,
                requests.exceptions.ReadTimeout,
                requests.exceptions.InvalidURL) as e:
            print(
                'Não consegui conectar na url, porque ocorreu um erro.\n{e}'.format(e=e))
            return onionurl

        loaded_json = json.loads(request.content)

        print(
            'Filtering the URLs that have the word .onion in the text')
        for data in loaded_json['data']:
            reddit_url = 'https://www.reddit.com{}'.format(
                data['permalink'])
            try:
                request = self.session.get(
                    reddit_url, headers=self.random_headers)
                soup = BeautifulSoup(request.content, features="lxml")

                for raw in soup.findAll('a', {'rel': 'nofollow'}):
                    # Anchors without an href are skipped instead of raising.
                    href = raw.get('href', '')
                    if 'https://' not in href:
                        continue
                    raw_text = self.raw(url=href)
                    if raw_text is None:
                        continue
                    print(
                        'Applying REGEX. Wait...')
                    onionurl.extend(self._extract_onions(raw_text))

            except (requests.exceptions.ConnectionError,
                    requests.exceptions.ChunkedEncodingError,
                    requests.exceptions.ReadTimeout,
                    requests.exceptions.InvalidURL) as e:
                print(
                    'Não consegui conectar na url, porque ocorreu um erro.\n{e}'.format(e=e))

        return onionurl

    def raw(self, url):
        """Return the visible text of the page at ``url``.

        :param url: Page to download; ``None`` is accepted.
        :returns: Page text with script/style content removed and the
            remaining strings joined by single spaces, or ``None`` when
            ``url`` is ``None``, the request fails, or the status is not 200.
        """
        if url is None:
            return None

        try:
            request = self.session.get(url, headers=self.random_headers)
        except (requests.exceptions.ConnectionError,
                requests.exceptions.ChunkedEncodingError,
                requests.exceptions.ReadTimeout,
                requests.exceptions.TooManyRedirects):
            # Best effort: unreachable pages are skipped.
            return None

        print(
            'Connecting in {url} - {status}'.format(url=url, status=request.status_code))

        if request.status_code != 200:
            return None

        soup = BeautifulSoup(request.content, features="lxml")
        for s in soup(['script', 'style']):
            s.decompose()

        return ' '.join(soup.stripped_strings)
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    # ``start`` is a property: reading the attribute runs the scraper.
    Reddit().start
|
@ -0,0 +1,31 @@
|
|||||||
|
#!/usr/bin/python3
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
|
__author__ = 'Andrey Glauzer'
|
||||||
|
__license__ = "MIT"
|
||||||
|
__version__ = "1.0.1"
|
||||||
|
__maintainer__ = "Andrey Glauzer"
|
||||||
|
__status__ = "Development"
|
||||||
|
|
||||||
|
import requests
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from onioningestor.sources import Source
|
||||||
|
|
||||||
|
|
||||||
|
class Plugin(Source):
    """Source plugin that reads onion addresses from a plain text file."""

    def __init__(self, logger, name, filename):
        """Store the logger, plugin name and the file to read.

        :param filename: Path relative to the directory two levels above the
            directory containing this module.
        """
        self.logger = logger
        self.name = name
        self.filename = filename
        # Hand the plugin name, not the instance itself, to the base class.
        super().__init__(name)

    def run(self):
        """Yield one ``self.onion`` record per non-blank line of the file.

        Records carry ``source='simple-file'`` and ``type='domain'``.
        """
        filepath = Path(__file__).parents[2] / self.filename
        # Read everything first so the file is closed before yielding; a
        # suspended generator would otherwise keep it open.
        with open(filepath, 'r') as fp:
            lines = fp.read().splitlines()

        for line in lines:
            onion = line.strip()
            # Blank lines would otherwise produce records with an empty url.
            if not onion:
                continue
            yield self.onion(url=onion, source='simple-file', type='domain')
|
||||||
|
|
Loading…
Reference in New Issue