renamed the package and fixed some issues
parent 23dd01a983
commit 8258056bef
@ -0,0 +1,131 @@
import sys
import time
import traceback
import collections

from . import config
from . import dbhandler
from . import loghandler


class Ingestor:
    """ThreatIngestor main work logic.

    Handles reading the config file, calling sources, maintaining state, and
    sending artifacts to operators.
    """
    def __init__(self, args):
        # Load logger.
        log = loghandler.LoggerHandler(args.logLevel)
        self.logger = log.start_logging()

        # Load config.
        self.config = config.Config(args.configFile, self.logger)
        self.blacklist = self.config.blacklist()

        # Load Elasticsearch.
        try:
            self.es = dbhandler.DbHandlerElasticSearch(
                self.config.elasticsearch(),
                self.logger)
        except Exception as e:
            # Error loading Elasticsearch.
            self.logger.error(e)
            self.logger.debug(traceback.format_exc())
            sys.exit(1)

        # Instantiate plugins.
        try:
            self.logger.info("Initializing sources")
            self.sources = {name: source(self.logger, **kwargs)
                            for name, source, kwargs in self.config.sources()}

            self.logger.info("Initializing operators")
            self.operators = {name: operator(self.logger, self.es, self.blacklist, **kwargs)
                              for name, operator, kwargs in self.config.operators()}

            self.logger.info("Initializing notifiers")
            #self.notifiers = {name: operator(**kwargs)
            #                  for name, operator, kwargs in self.config.notifiers()}
        except Exception as e:
            # Error loading plugins.
            self.logger.error(e)
            self.logger.debug(traceback.format_exc())
            sys.exit(1)

    def run(self):
        """Run once, or forever, depending on config."""
        if self.config.daemon():
            self.logger.info("Running forever, in a loop")
            self.run_forever()
        else:
            self.logger.info("Running once, to completion")
            self.run_once()

    def run_once(self):
        """Run each source once, passing artifacts to each operator."""
        # Track some statistics about artifacts in a summary object.
        summary = collections.Counter()

        for source in self.sources:
            # Run the source to collect artifacts.
            self.logger.info(f"Running source '{source}'")
            try:
                onions = self.sources[source].run()
                if onions:
                    self.logger.info('Found hidden service links')
                else:
                    self.logger.info('No links found')
            except Exception as e:
                self.logger.error(e)
                self.logger.error(traceback.format_exc())
                continue

            # Process artifacts with each operator.
            for operator in self.operators:
                self.logger.info(f"Processing found onions with operator '{operator}'")
                try:
                    doc = self.operators[operator].process(onions)
                    # Save the source state.
                    self.es.save(doc)
                except Exception as e:
                    self.logger.error(e)
                    self.logger.error(traceback.format_exc())
                    continue

            # # Record stats and update the summary.
            # types = artifact_types(doc.get('interestingKeywords'))
            # summary.update(types)
            # for artifact_type in types:
            #     self.logger.info(f'types[artifact_type]')

        # Log the summary.
        self.logger.info(f"New artifacts: {dict(summary)}")

    def run_forever(self):
        """Run forever, sleeping for the configured interval between each run."""
        while True:
            self.run_once()

            self.logger.info(f"Sleeping for {self.config.sleep()} seconds")
            time.sleep(self.config.sleep())


def artifact_types(artifact_list):
    """Return a dictionary with counts of each artifact type."""
    types = {}
    for artifact in artifact_list:
        artifact_type = artifact.__class__.__name__.lower()
        if artifact_type in types:
            types[artifact_type] += 1
        else:
            types[artifact_type] = 1

    return types
@ -0,0 +1,49 @@
"""OnionScraper

A Python 3 application for scraping hidden services and indexing them into Elasticsearch.

Installation:
This application assumes you have python3 and pip3 installed.

    pip3 install -r requirements.txt


This software is provided subject to the MIT license stated below.
--------------------------------------------------
MIT License

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------
"""
import argparse

from onioningestor import Ingestor


# Load arguments from user.
parser = argparse.ArgumentParser(
    prog='onionscraper',
    description=__doc__,
    formatter_class=argparse.RawDescriptionHelpFormatter)
parser.add_argument('-c', '--config', dest="configFile", required=True,
                    help='Path to config file')
parser.add_argument("--log", dest="logLevel", default='INFO',
                    choices=['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'],
                    help="Set the logging level, default is INFO")

args = parser.parse_args()
app = Ingestor(args)

app.run()
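# Example invocation (illustrative sketch; it assumes this script is saved as
# onionscraper.py and that a config.yml exists next to it; both names are
# assumptions, not part of this commit):
#
#   python3 onionscraper.py -c config.yml --log DEBUG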
@ -0,0 +1,172 @@
import io
import importlib
import traceback

import yaml

from pathlib import Path


SOURCE = 'onioningestor.sources'
OPERATOR = 'onioningestor.operators'

INTERNAL_OPTIONS = [
    'saved_state',
    'module',
    'credentials',
]

ARTIFACT_TYPES = 'artifact_types'
FILTER_STRING = 'filter'
ALLOWED_SOURCES = 'allowed_sources'
NAME = 'name'


class Config:
    """Config read/write operations, and convenience methods."""
    def __init__(self, filename, logger):
        """Read a config file."""
        self.logger = logger
        self.filename = filename
        with io.open(self.filename, 'r') as f:
            try:
                self.logger.info("Loading config file")
                self.config = yaml.safe_load(f.read())
            except yaml.error.YAMLError:
                self.logger.error("YAML error in config")

    @staticmethod
    def _load_plugin(plugin_type, plugin):
        """Returns plugin class or raises an exception.

        :raises: threatingestor.exceptions.PluginError
        """
        try:
            module = importlib.import_module('.'.join([plugin_type, plugin]))
            return module.Plugin
        except Exception as e:
            print(e)
            traceback.print_exc()

    def daemon(self):
        """Returns boolean, are we daemonizing?"""
        return self.config['general']['daemon']

    def elasticsearch(self):
        """Returns Elasticsearch config."""
        return self.config['general']['elasticsearch']

    def sleep(self):
        """Returns number of seconds to sleep between iterations, if daemonizing."""
        return self.config['general']['sleep']

    def blacklist(self):
        """Returns the list of blacklisted keywords."""
        return self.config['general']['blacklist'].split(',')

    # def onionscanner(self):
    #     """Returns onionscanner config dict"""
    #     screenshots = self.config['onionscanner'].pop('screenshots_path', None)
    #     if screenshots:
    #         self.config['onionscanner']['screenshots_path'] = Path(screenshots)
    #     else:
    #         self.config['onionscanner']['screenshots_path'] = Path(__file__).parents[1]/'screenshots'
    #     blacklist = self.config['onionscanner'].pop('blacklist', None)
    #     if blacklist:
    #         self.config['onionscanner']['blacklist'] = blacklist.split(',')
    #     interestingKeywords = self.config['onionscanner'].pop('interestingKeywords', None)
    #     if interestingKeywords:
    #         self.config['onionscanner']['interestingKeywords'] = blacklist.split(',')
    #     return self.config['onionscanner']

    def notifiers(self):
        """Returns notifiers config dictionary."""
        return self.config.get('notifiers', {})

    def logging(self):
        """Returns logging config dictionary."""
        return self.config.get('logging', {})

    def credentials(self, credential_name):
        """Return a dictionary with the specified credentials."""
        for credential in self.config['credentials']:
            for key, value in credential.items():
                if key == NAME and value == credential_name:
                    return credential
        return {}

    def sources(self):
        """Return a list of (name, Source class, {kwargs}) tuples.

        :raises: threatingestor.exceptions.PluginError
        """
        sources = []

        for source in self.config['sources']:
            kwargs = {}
            for key, value in source.items():
                if key not in INTERNAL_OPTIONS:
                    kwargs[key] = value

                elif key == 'credentials':
                    # Grab these named credentials.
                    credential_name = value
                    for credential_key, credential_value in self.credentials(credential_name).items():
                        if credential_key != NAME:
                            kwargs[credential_key] = credential_value

            # Load and initialize the plugin.
            self.logger.info(f"Found source '{source[NAME]}'")
            sources.append((source[NAME], self._load_plugin(SOURCE, source['module']), kwargs))

        self.logger.info(f"Found {len(sources)} total sources")
        return sources

    def operators(self):
        """Return a list of (name, Operator class, {kwargs}) tuples.

        :raises: threatingestor.exceptions.PluginError
        """
        operators = []
        for operator in self.config['operators']:
            kwargs = {}
            for key, value in operator.items():
                if key not in INTERNAL_OPTIONS:
                    if key == ARTIFACT_TYPES:
                        # Parse out the special artifact_types option.
                        artifact_types = []
                        for artifact in value:
                            try:
                                artifact_types.append(threatingestor.artifacts.STRING_MAP[artifact.lower().strip()])
                            except KeyError:
                                # Ignore invalid artifact types.
                                pass
                        kwargs[key] = artifact_types

                    elif key == FILTER_STRING:
                        # Pass in the special filter_string option.
                        kwargs['filter_string'] = value

                    elif key == NAME:
                        # Exclude the name key from operator kwargs, since it's not used.
                        pass

                    else:
                        kwargs[key] = value

                elif key == 'credentials':
                    # Grab these named credentials.
                    credential_name = value
                    for credential_key, credential_value in self.credentials(credential_name).items():
                        if credential_key != NAME:
                            kwargs[credential_key] = credential_value

            # Load and initialize the plugin.
            self.logger.info(f"Found operator '{operator[NAME]}'")
            operators.append((operator[NAME], self._load_plugin(OPERATOR, operator['module']), kwargs))

        self.logger.info(f"Found {len(operators)} total operators")
        return operators
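# Illustrative config.yml sketch matching the keys this class reads. Everything
# below is an assumption for documentation purposes (module names, hosts and
# values are placeholders, not part of this commit):
#
# general:
#   daemon: False
#   sleep: 60
#   blacklist: keyword1,keyword2
#   elasticsearch:
#     index: onions
#     host: 127.0.0.1
#     port: 9200
#
# sources:
#   - name: gist
#     module: gist
#     url: https://gist.github.com/search?l=Text&q=.onion
#
# operators:
#   - name: simple-html
#     module: html
#     timeout: 30
#     retries: 2
#     socks5:
#       http: socks5h://127.0.0.1:9050
#       https: socks5h://127.0.0.1:9050
#     TorController:
#       port: 9051
#       password: torpassword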
@ -0,0 +1,75 @@
import sys
import traceback

from elasticsearch import Elasticsearch, helpers


class DbHandlerElasticSearch:
    def __init__(self, config, logger):
        self.logger = logger
        self.logger.info('Creating Elasticsearch mapping')
        self.config = config
        self.mapping = '''
        {
            "mappings": {
                "_doc": {
                    "properties": {
                        "hiddenService": {
                            "type": "text"
                        },
                        "blacklist": {
                            "type": "keyword"
                        },
                        "monitor": {
                            "type": "boolean"
                        },
                        "simple-html": {
                            "type": "nested",
                            "properties": {
                                "HTML": {
                                    "type": "long"
                                },
                                "title": {
                                    "type": "text"
                                },
                                "language": {
                                    "type": "text"
                                },
                                "status": {
                                    "type": "text"
                                },
                                "date-indexed": {
                                    "type": "date"
                                }
                            }
                        }
                    }
                }
            }
        }
        '''
        self.index = self.config['index']
        try:
            self.es = Elasticsearch([{
                'host': self.config['host'],
                'port': self.config['port']}])
            self.es.indices.create(
                index=self.index,
                body=self.mapping,
                ignore=400)
        except Exception as e:
            self.logger.error(e)
            self.logger.error(traceback.format_exc())
            sys.exit(1)

    def count(self):
        self.es.indices.refresh(self.index)
        status = self.es.count(index=self.index)
        if status['_shards']['successful'] == 1:
            self.logger.info('Successful')
            self.logger.info('Count:%d', status['count'])
        else:
            self.logger.error(status)

    def save(self, doc):
        self.es.index(index=self.index, body=doc)
        self.count()
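# Shape of a document passed to save() by the simple-html operator, as built in
# its run_sessions()/response() methods (illustrative example; values are
# placeholders):
#
# {
#     "simple-html": {
#         "HTML": "<html>...</html>",
#         "title": "Example hidden service",
#         "language": "en",
#         "date-crawled": "2020-01-01T00:00:00.000000Z",
#         "status": "success"
#     },
#     "hiddenService": "example.onion"
# }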
@ -0,0 +1,33 @@
import os
import logging
from pathlib import Path


class LoggerHandler():
    def __init__(self, level):
        self.level = getattr(logging, level)
        self.logger = logging.getLogger("OnionScraper")
        self.logger.setLevel(self.level)

        # Create console handler and set level.
        ch = logging.StreamHandler()
        ch.setLevel(self.level)

        # Create file logging.
        logFile = Path(__file__).parents[1]
        logging_path = os.path.join(logFile, "info.log")
        fh = logging.FileHandler(logging_path)

        # Create formatters.
        formatter = logging.Formatter('[%(asctime)s] - %(name)s - %(levelname)s - %(message)s', datefmt='%a, %d %b %Y %H:%M:%S')
        formatter_console = logging.Formatter('[%(asctime)s] - %(levelname)s - %(message)s', datefmt='%d %b %Y %H:%M:%S')
        # Add formatters to the handlers.
        ch.setFormatter(formatter_console)
        fh.setFormatter(formatter)
        # Add the handlers to the logger.
        self.logger.addHandler(ch)  # console logging
        self.logger.addHandler(fh)  # file logging

    def start_logging(self):
        self.logger.info('Starting OnionScraper')
        return self.logger
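# Usage sketch (illustrative): build the handler once and hand the logger to
# the rest of the application.
#
#   log = LoggerHandler('DEBUG')
#   logger = log.start_logging()
#   logger.info('hello from OnionScraper')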
@ -0,0 +1,95 @@
import re
import sys
import json


class Operator:
    """Base class for all Operator plugins.

    Note: This is an abstract class. You must extend ``__init__`` and call
    ``super`` to ensure this class's constructor is called. You must override
    ``handle_onion`` with the same signature. You may define additional
    ``handle_{artifact_type}`` methods as needed (see the threatkb operator for
    an example) - these methods are purely convention, and are not required.

    When adding additional methods to child classes, consider prefixing the
    method name with an underscore to denote a ``_private_method``. Do not
    override other existing methods from this class.
    """
    def __init__(self, logger, elasticsearch, allowed_sources=None):
        """Override this constructor in child classes.

        The arguments above (logger, elasticsearch, allowed_sources) should be
        accepted explicitly as above, in all child classes.

        Additional arguments should be added: url, auth, etc, whatever is
        needed to set up the object.

        Each operator should default self.artifact_types to a list of Artifacts
        supported by the plugin, and allow passing in artifact_types to
        overwrite that default.

        Example:

        >>> self.artifact_types = artifact_types or [
        ...     artifacts.IPAddress,
        ...     artifacts.Domain,
        ... ]

        It's recommended to call this __init__ method via super from all child
        classes. Remember to do so *before* setting any default artifact_types.
        """
        self.logger = logger
        self.blacklist = re.compile('|'.join([re.escape(word) for word in allowed_sources]), re.IGNORECASE)
        self.es = elasticsearch

    def response(self, content, onion, operator_name):
        """
        content: dict
        onion: str
        return: dict
        """
        try:
            return {operator_name: json.loads(str(content)), 'hiddenService': onion}
        except json.decoder.JSONDecodeError as e:
            self.logger.info('JSONDecodeError')
            return {operator_name: content, 'hiddenService': onion}
        #except TypeError:
        #    return {operator_name: None, 'hiddenService': onion}
        except Exception as e:
            self.logger.error(e)

    def handle_onion(self, url):
        """Override with the same signature.

        :param url: A single onion URL.
        :returns: None (always ignored)
        """
        raise NotImplementedError()

    def _onion_is_allowed(self, response, type='URL'):
        """Returns True if this is allowed by this plugin's filters."""
        # Must not match the blacklist, if set.
        if type == 'URL':
            print(response)
            blacklist = self.blacklist.findall(response['hiddenService'])
        elif type == 'HTML':
            response['simple-html'].pop('status')
            response['simple-html']['status'] = 'blocked'
            blacklist = self.blacklist.findall(response['simple-html']['HTML'])
        if blacklist:
            self.es.save(response)
            return False
        return True

    def process(self, onions):
        """Process all applicable onions."""
        for onion in onions:
            if self._onion_is_allowed(
                    self.response({'status': 'blocked'}, onion.url, 'regex-blacklist'),
                    type='URL'):
                self.handle_onion(onion.url)
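# Minimal sketch of a child operator plugin following the contract above
# (illustrative only; the plugin name and behaviour are assumptions, not part
# of this commit):
#
# from onioningestor.operators import Operator
#
# class Plugin(Operator):
#     def __init__(self, logger, elasticsearch, allowed_sources, **kwargs):
#         super(Plugin, self).__init__(logger, elasticsearch, allowed_sources)
#         self.plugin_name = 'example'
#
#     def handle_onion(self, url):
#         # Fetch or inspect the hidden service, then index the result.
#         doc = self.response({'status': 'success'}, url, self.plugin_name)
#         if self._onion_is_allowed(doc):
#             self.es.save(doc)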
@ -0,0 +1,99 @@
import time
import json
import traceback
from datetime import datetime as dt
from json.decoder import JSONDecodeError

import requests

from bs4 import BeautifulSoup

from langdetect import detect

from stem.control import Controller
from stem import Signal

from onioningestor.operators import Operator


class Plugin(Operator):
    """Simple-html

    This plugin collects the HTML code from an onion link.
    """

    def __init__(self, logger, elasticsearch, allowed_sources, **kwargs):
        super(Plugin, self).__init__(logger, elasticsearch, allowed_sources)
        self.plugin_name = 'simple-html'
        self.logger.info(f"Initializing {self.plugin_name}")

        self.timeout = int(kwargs['timeout'])
        self.retries = int(kwargs['retries'])

        self.proxy = kwargs['socks5']
        self.torControl = kwargs['TorController']
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:72.0) Gecko/20100101 Firefox/72.0',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'DNT': '1',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1'}

    def get_tor_session(self):
        try:
            s = requests.session()
            s.proxies = self.proxy
            s.headers.update(self.headers)
        except Exception as e:
            self.logger.error(e)
            self.logger.debug(traceback.format_exc())
        return s

    def renew_connection(self):
        with Controller.from_port(port=self.torControl['port']) as controller:
            # Now we switch Tor identities to make sure we have a good connection.
            self.logger.info('Getting new Tor IP')
            # Authenticate to our local Tor controller.
            controller.authenticate(self.torControl['password'])
            # Send the signal for a new identity.
            controller.signal(Signal.NEWNYM)
            # Wait for the new identity to be initialized.
            time.sleep(controller.get_newnym_wait())
            session = self.get_tor_session()
            self.logger.info(f"IP is {session.get('http://httpbin.org/ip').json()['origin']}")

    def run_sessions(self, onion):
        retry = 0
        result = None
        while True:
            try:
                url = 'http://' + onion
                self.logger.info(url)
                content = self.get_tor_session().get(url)
                if content.status_code == 200:
                    result = content.text
                if result:
                    html = BeautifulSoup(result, features="lxml")
                    index = {
                        'HTML': result,
                        'title': html.title.text,
                        'language': detect(html.text),
                        'date-crawled': dt.utcnow().strftime('%Y-%m-%dT%H:%M:%S.%f') + 'Z',
                        'status': 'success'}
                    return self.response(index, onion, self.plugin_name)
            except requests.exceptions.ConnectionError as connection_error:
                self.logger.error(f'Failed connecting to {url}')
                self.logger.debug(connection_error)
            except Exception as e:
                self.logger.error(e)
                self.logger.debug(traceback.format_exc())

            self.logger.info('[x] No results found retrying ...')
            retry += 1
            self.renew_connection()
            if retry > self.retries:
                self.logger.error('[x] Max retries exceeded')
                return self.response({'status': 'failure'}, onion, self.plugin_name)

    def handle_onion(self, onion):
        content = self.run_sessions(onion)
        print(content)
        if content[self.plugin_name]['status'] == 'success':
            if self._onion_is_allowed(content):
                self.es.save(content)
@ -0,0 +1,264 @@
import re
import os
import sys
import json
import time
import random
import traceback
import subprocess
from uuid import uuid4
from pathlib import Path
from datetime import datetime as dt
from json.decoder import JSONDecodeError
from concurrent.futures import ProcessPoolExecutor
from threading import Timer

import requests

from stem.control import Controller
from stem import Signal

from selenium import webdriver
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary

from onionscraper.operators import Operator


class Plugin(Operator):
    """OnionScanner operator.

    Runs the onionscan binary against a hidden service, fetches its content
    over Tor, takes a screenshot, and builds a document for indexing.
    """
    def __init__(self, logger, **kwargs):
        self.logger = logger
        self.logger.info('Initializing OnionScanner')
        screenshots = kwargs.pop('screenshots_path', None)
        if screenshots:
            self.screenshots = Path(screenshots)
        else:
            self.screenshots = Path(__file__).parents[1]/'screenshots'
        self.onionscan = kwargs['binpath']
        self.timeout = int(kwargs['timeout'])
        self.proxy = kwargs['socks5']
        self.torControl = kwargs['TorController']
        self.retries = int(kwargs['retries'])
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:72.0) Gecko/20100101 Firefox/72.0',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'DNT': '1',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1'}

        blacklist = kwargs['blacklist'].split(',')
        self.blacklist = re.compile('|'.join([re.escape(word) for word in blacklist]), re.IGNORECASE)
        keywords = kwargs['interestingKeywords'].split(',')
        self.interestingKeywords = re.compile('|'.join([re.escape(word) for word in keywords]), re.IGNORECASE)
        self.session = self.get_tor_session()

    def response(self, status, content, onion):
        """
        status: success/failure
        content: dict
        onion: str
        return: dict
        """
        return {'status': status, 'data': content, 'onion': onion}

    def parseDoc(self, data):
        data['onionscan'].pop('simpleReport', None)
        crawls = data['onionscan'].pop('crawls', None)
        hiddenService = data['onionscan'].pop('hiddenService', None)
        data['onionscan']['crawls'] = [*crawls]
        data['hiddenService'] = hiddenService
        for onion in crawls.keys():
            print(onion)
            with open('/home/tony/Projects/OnionScraper_v2/onion_master_list.txt', 'a') as fp:
                fp.write("%s\n" % onion)
            #q.enqueue(self.crawl, onion)
        #with open('test.json', 'w', encoding='utf-8') as f:
        #    json.dump(data, f, ensure_ascii=False, indent=4)
        return data

    def format_directory(self, directory):
        d = dt.now()
        year = str(d.year)
        month = str(d.month)
        # Prefix month and day with "0" if it is only one digit.
        if len(month) < 2:
            month = "0" + month
        day = str(d.day)
        if len(day) < 2:
            day = "0" + day
        save_path = directory/year/month/day
        if not os.path.isdir(save_path):
            self.logger.info("[*] Creating directory to save screenshots")
            os.makedirs(save_path)

        return save_path

    def take_screenshot(self, save_path, onion):
        binary = FirefoxBinary('/home/tony/Projects/OnionScraper/geckodriver')
        fp = webdriver.FirefoxProfile()
        fp.set_preference('network.proxy.type', 1)
        fp.set_preference('network.proxy.socks', '127.0.0.1')
        fp.set_preference('network.proxy.socks_port', 9050)
        fp.set_preference('network.proxy.socks_remote_dns', True)

        options = Options()
        options.headless = True
        driver = webdriver.Firefox(
            executable_path='/home/tony/Projects/OnionScraper/geckodriver',
            options=options,
            firefox_profile=fp)
        url = 'http://' + onion
        driver.get(url)
        uid = str(uuid4()).split('-')[0]
        filename = f"{onion}_screenshot_{uid}.png"
        f_name = f"{save_path}/{filename}"
        driver.save_screenshot(f_name)

        driver.quit()

        if os.path.isfile(f_name):
            self.logger.info(f'[*] Screenshot was taken. {f_name}')
            dateScreenshoted = dt.utcnow().strftime('%Y-%m-%dT%H:%M:%S.%f') + 'Z'
            result = {'dateScreenshoted': dateScreenshoted, 'filename': filename}
            return self.response("success", result, onion)
        else:
            self.logger.error('[x] Unable to take screenshot')
            return self.response("failure", None, onion)

    def get_tor_session(self):
        try:
            s = requests.session()
            s.proxies = self.proxy
            s.headers.update(self.headers)
        except Exception as e:
            self.logger.error(e)
            self.logger.debug(traceback.format_exc())
        return s

    # Signal Tor for a new connection.
    def renew_connection(self):
        with Controller.from_port(port=self.torControl['port']) as controller:
            # Now we switch Tor identities to make sure we have a good connection.
            self.logger.info('Getting new Tor IP')
            # Authenticate to our local Tor controller.
            controller.authenticate(self.torControl['password'])
            # Send the signal for a new identity.
            controller.signal(Signal.NEWNYM)
            # Wait for the new identity to be initialized.
            time.sleep(controller.get_newnym_wait())
            session = self.get_tor_session()
            self.logger.info(f"IP is {session.get('http://httpbin.org/ip').json()['origin']}")

    def handle_timeout(self, process, onion):
        #
        # Handle a timeout from the onionscan process.
        #
        try:
            # Kill the onionscan process.
            process.kill()
            self.logger.info("[!!!] Killed the onionscan process.")
        except:
            pass
        self.renew_connection()
        return

    def run_sessions(self, onion):
        retry = 0
        result = None
        while True:
            try:
                url = 'http://' + onion
                self.logger.info(url)
                content = self.session.get(url)
                if content.status_code == 200:
                    result = content.json()
            except JSONDecodeError as e:
                self.logger.debug(f'JSONDecodeError {e}')
                result = content.text
            except Exception as e:
                self.logger.error(e)
                self.logger.debug(traceback.format_exc())
            finally:
                if result:
                    return self.response("success", result, onion)
                else:
                    self.logger.info('[x] No results found retrying ...')
                    retry += 1
                    self.renew_connection()
                    if retry > self.retries:
                        self.logger.error('[x] Max retries exceeded')
                        return self.response("failure", None, onion)

    def run_onionscan(self, onion):
        self.logger.info("[*] Running onionscan on %s", onion)

        # Fire up onionscan.
        process = subprocess.Popen([self.onionscan, "--webport=0", "--jsonReport", "--simpleReport=false", onion], stdout=subprocess.PIPE, stderr=subprocess.PIPE)

        # Start the timer and let it run till timeout minutes.
        process_timer = Timer(300, self.handle_timeout, args=[process, onion])
        process_timer.start()

        # Wait for the onion scan results.
        stdout = process.communicate()[0]

        # We have received valid results so we can kill the timer.
        if process_timer.is_alive():
            process_timer.cancel()
            try:
                return self.response("success", json.loads(stdout), onion)
            except json.decoder.JSONDecodeError:
                pass

        self.logger.info("[!!!] Process timed out for %s", onion)

        return self.response("failure", None, onion)

    def handle_onion(self, onion_tuple):
        onion = onion_tuple.url
        self.logger.info(f'Processing {onion} with onionscan')
        try:
            blacklist_URL = self.blacklist.search(onion)
            if blacklist_URL:
                self.logger.info(f"[X] Blocked by blacklist => matched keyword {blacklist_URL.group()}")
            else:
                self.logger.debug("[*] URL blacklist test: PASSED")
                results = self.run_onionscan(onion)
                if results['status'] == 'success':  # and results['data']['webDetected'] == 'true':
                    content = self.run_sessions(onion)
                    if content['status'] == 'success':
                        blacklist_CONTENT = self.blacklist.search(content['data'])
                        if blacklist_CONTENT:
                            self.logger.info(f"[X] Blocked by blacklist content => matched keyword {blacklist_CONTENT.group()}")
                        else:
                            self.logger.debug("[*] CONTENT blacklist test: PASSED")
                            screenshot = self.take_screenshot(self.format_directory(self.screenshots), onion)
                            self.logger.info("Indexing!")
                            doc = {
                                'onionscan': results['data'],
                                'html': content['data'],
                                'screenshots': screenshot['data'],
                                'interestingKeywords': self.interestingKeywords.findall(content['data'])
                            }
                            return self.parseDoc(doc)

                else:
                    self.logger.info(f"[x] hidden service {onion} is not active")
        except Exception as e:
            self.logger.error(e)
            self.logger.error(traceback.format_exc())
        finally:
            pass
            #sys.exit(0)
@ -0,0 +1,15 @@
from onionscraper.operators import Operator


class Plugin(Operator):
    """Operator for output to flat CSV file."""
    def __init__(self, filename, base_score):
        """CSV operator."""
        self.filename = filename

        #super(Plugin, self).__init__(artifact_types, filter_string, allowed_sources)

    def handle_artifact(self, artifact):
        """Operate on a single artifact."""
        pass
@ -0,0 +1,41 @@
from collections import namedtuple


class Source(object):
    """Base class for all Source plugins.

    Note: This is an abstract class. You must override ``__init__`` and ``run``
    in child classes. You should not override ``process_element``. When adding
    additional methods to child classes, consider prefixing the method name
    with an underscore to denote a ``_private_method``.
    """
    def __init__(self, name, *args, **kwargs):
        """Override this constructor in child classes.

        The first argument must always be ``name``.
        Other arguments should be url, auth, etc, whatever is needed to set
        up the object.
        """
        self.onion = namedtuple('onion', ['url', 'source', 'type'])

    def run(self):
        """Run and return ``(saved_state, list(Artifact))``.

        Override this method in child classes.
        The method signature and return values must remain consistent.

        The method should attempt to pick up where we left off using
        ``saved_state``, if supported. If ``saved_state`` is ``None``, you can
        assume this is a first run. If state is maintained by the remote
        resource (e.g. as it is with SQS), ``saved_state`` should always be
        ``None``.
        """
        raise NotImplementedError()

    def process_element(self, content, reference_link, include_nonobfuscated=False):
        """Take a single source content/url and return a list of Artifacts.

        This is the main work block of Source plugins, which handles
        IOC extraction and artifact creation.

        :param content: String content to extract from.
        :param reference_link: Reference link to attach to all artifacts.
        :param include_nonobfuscated: Include non-defanged URLs in output?
        """
        self.logger.debug(f"Processing in source '{self.name}'")
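# Minimal sketch of a child source plugin following the contract above
# (illustrative only; the example URL and return value are assumptions, not
# part of this commit):
#
# from onioningestor.sources import Source
#
# class Plugin(Source):
#     def __init__(self, logger, name, url):
#         self.logger = logger
#         self.name = name
#         self.url = url
#         super().__init__(self)
#
#     def run(self):
#         # Return (or yield) onion namedtuples for the ingestor to process.
#         return [self.onion(url='example.onion', source=self.name, type='domain')]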
@ -0,0 +1,153 @@
#!/usr/bin/python3
# -*- coding: utf-8 -*-

__author__ = 'Andrey Glauzer'
__license__ = "MIT"
__version__ = "1.0.1"
__maintainer__ = "Andrey Glauzer"
__status__ = "Development"

import requests
import json
import re
import urllib.parse
from random import choice
import time
from bs4 import BeautifulSoup


from onionscraper.sources import Source


class Plugin(Source):

    def __init__(self, logger, name, url):
        self.logger = logger
        self.name = name
        self.url = url
        self.desktop_agents = [
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:60.0) Gecko/20100101 Firefox/60.0']
        super().__init__(self)

    def run(self):
        self.logger.info('Starting Gist Scraper')
        self.cookies()
        self.pagination()
        self.scraping()
        return self.raw()

    @property
    def random_headers(self):
        return {
            'User-Agent': choice(self.desktop_agents),
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
        }

    def cookies(self):
        self.logger.info('Setting GIST cookies')

        with requests.Session() as self.session:
            self.headers = self.random_headers

            request = self.session.get(self.url, headers=self.headers)

            if request.status_code == 200:
                pass
            else:
                self.logger.error('No Response from GIST')

    def pagination(self):
        request = self.session.get(
            f"https://gist.github.com/search?l=Text&q={urllib.parse.quote('.onio')}", headers=self.headers)
        self.soup = BeautifulSoup(request.content, features="lxml")

        pages = []
        self.urls = [self.url]
        try:
            for pagination in self.soup.find('div', {'class': 'pagination'}).findAll('a'):
                pages.append(pagination.get_text())
        except:
            pages = False

        if pages:
            cont = 2
            while cont <= 1:  # int(pages[-2]):
                cont += 1
                full_url = f"https://gist.github.com/search?l=Text&p={cont-1}&q={urllib.parse.quote('.onio')}"
                self.urls.append(full_url)

    def scraping(self):
        url = []
        for inurl in self.urls:
            self.logger.info(f"Connecting to {inurl}")
            time.sleep(5)
            request = self.session.get(inurl, headers=self.headers)

            if request.status_code == 200:
                soup = BeautifulSoup(request.content, features="lxml")
                for code in soup.findAll('div', {'class': 'gist-snippet'}):
                    if '.onion' in code.get_text().lower():
                        for raw in code.findAll('a', {'class': 'link-overlay'}):
                            try:
                                url.append(raw['href'])
                            except:
                                pass

        self.urls_raw = []
        for get in url:
            self.logger.info(f"Connecting to {get}")
            time.sleep(5)
            try:
                request = self.session.get(get, headers=self.headers)

                if request.status_code == 200:
                    soup = BeautifulSoup(request.content, features="lxml")

                    for raw in soup.findAll('a', {'class': 'btn btn-sm'}):
                        try:
                            gist_url = f"https://gist.githubusercontent.com{raw['href']}"

                            self.urls_raw.append(gist_url)

                        except:
                            pass
            except(requests.exceptions.ConnectionError,
                   requests.exceptions.ChunkedEncodingError,
                   requests.exceptions.ReadTimeout,
                   requests.exceptions.InvalidURL) as e:
                self.logger.error(
                    f"Unable to connect to the URL because an error occurred.\n{e}")
                pass

    def raw(self):
        self.logger.info('Performing replaces and regex. WAIT...')
        itens = []
        onions = []
        for raw in self.urls_raw:
            if '.txt' in raw.lower() \
                    or '.csv' in raw.lower():
                time.sleep(5)
                request = self.session.get(raw, headers=self.headers)
                self.soup = BeautifulSoup(request.content, features="lxml")
                for pre in self.soup.findAll('body'):
                    list = pre.get_text().split('\n')
                    itens.extend(list)

        regex = re.compile(
            r"[A-Za-z0-9]{0,12}\.?[A-Za-z0-9]{12,50}\.onion")

        for lines in itens:
            rurls = lines \
                .replace('\xad', '') \
                .replace('\n', '') \
                .replace("http://", '') \
                .replace("https://", '') \
                .replace("www.", "")

            url = regex.match(rurls)

            if url is not None:
                onions.append(self.onion(url=url.group(), source='gist', type='domain'))
        return onions
@ -0,0 +1,153 @@
#!/usr/bin/python3
# -*- coding: utf-8 -*-

__author__ = 'Daniele Perera'
__license__ = "MIT"
__version__ = "1.0.1"
__maintainer__ = "Daniele Perera"
__status__ = "Development"

import requests
import json
import re
import urllib.parse
from random import choice
import time
from bs4 import BeautifulSoup


from onionscraper.sources import Source


class Plugin(Source):

    def __init__(self, logger, name, url):
        self.logger = logger
        self.name = name
        self.url = url
        self.desktop_agents = [
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:60.0) Gecko/20100101 Firefox/60.0']
        super().__init__(self)

    def run(self):
        self.logger.info('Starting Gist Scraper')
        self.cookies()
        self.pagination()
        self.scraping()
        return self.raw()

    @property
    def random_headers(self):
        return {
            'User-Agent': choice(self.desktop_agents),
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
        }

    def cookies(self):
        self.logger.info('Setting GIST cookies')

        with requests.Session() as self.session:
            self.headers = self.random_headers

            request = self.session.get(self.url, headers=self.headers)

            if request.status_code == 200:
                pass
            else:
                self.logger.error('No Response from GIST')

    def pagination(self):
        request = self.session.get(
            f"https://gist.github.com/search?l=Text&q={urllib.parse.quote('.onio')}", headers=self.headers)
        self.soup = BeautifulSoup(request.content, features="lxml")

        pages = []
        self.urls = [self.url]
        try:
            for pagination in self.soup.find('div', {'class': 'pagination'}).findAll('a'):
                pages.append(pagination.get_text())
        except:
            pages = False

        if pages:
            cont = 2
            while cont <= 1:  # int(pages[-2]):
                cont += 1
                full_url = f"https://gist.github.com/search?l=Text&p={cont-1}&q={urllib.parse.quote('.onio')}"
                self.urls.append(full_url)

    def scraping(self):
        url = []
        for inurl in self.urls:
            self.logger.info(f"Connecting to {inurl}")
            time.sleep(5)
            request = self.session.get(inurl, headers=self.headers)

            if request.status_code == 200:
                soup = BeautifulSoup(request.content, features="lxml")
                for code in soup.findAll('div', {'class': 'gist-snippet'}):
                    if '.onion' in code.get_text().lower():
                        for raw in code.findAll('a', {'class': 'link-overlay'}):
                            try:
                                url.append(raw['href'])
                            except:
                                pass

        self.urls_raw = []
        for get in url:
            self.logger.info(f"Connecting to {get}")
            time.sleep(5)
            try:
                request = self.session.get(get, headers=self.headers)

                if request.status_code == 200:
                    soup = BeautifulSoup(request.content, features="lxml")

                    for raw in soup.findAll('a', {'class': 'btn btn-sm'}):
                        try:
                            gist_url = f"https://gist.githubusercontent.com{raw['href']}"

                            self.urls_raw.append(gist_url)

                        except:
                            pass
            except(requests.exceptions.ConnectionError,
                   requests.exceptions.ChunkedEncodingError,
                   requests.exceptions.ReadTimeout,
                   requests.exceptions.InvalidURL) as e:
                self.logger.error(
                    f"Unable to connect to the URL because an error occurred.\n{e}")
                pass

    def raw(self):
        self.logger.info('Performing replaces and regex. WAIT...')
        itens = []
        onions = []
        for raw in self.urls_raw:
            if '.txt' in raw.lower() \
                    or '.csv' in raw.lower():
                time.sleep(5)
                request = self.session.get(raw, headers=self.headers)
                self.soup = BeautifulSoup(request.content, features="lxml")
                for pre in self.soup.findAll('body'):
                    list = pre.get_text().split('\n')
                    itens.extend(list)

        regex = re.compile(
            r"[A-Za-z0-9]{0,12}\.?[A-Za-z0-9]{12,50}\.onion")

        for lines in itens:
            rurls = lines \
                .replace('\xad', '') \
                .replace('\n', '') \
                .replace("http://", '') \
                .replace("https://", '') \
                .replace("www.", "")

            url = regex.match(rurls)

            if url is not None:
                onions.append(self.onion(url=url.group(), source='gist', type='domain'))
        return onions
@ -0,0 +1,120 @@
#!/usr/bin/python3
# -*- coding: utf-8 -*-

__author__ = 'Andrey Glauzer'
__license__ = "MIT"
__version__ = "1.0.1"
__maintainer__ = "Andrey Glauzer"
__status__ = "Development"

import requests
import json
import re
import logging
import urllib.parse
from random import choice
from bs4 import BeautifulSoup


class Reddit:
    def __init__(self):
        self.session = requests.session()

        self.source = 'Reddit'

        self.url = 'https://api.pushshift.io/reddit/search/comment/?subreddit=onions&limit=1000000'
        self.desktop_agents = [
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:60.0) Gecko/20100101 Firefox/60.0']

    @property
    def random_headers(self):
        return {
            'User-Agent': choice(self.desktop_agents),
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
        }

    @property
    def start(self):
        self.reddit_json()

    def reddit_json(self):
        print('Getting Reddit API information')
        onionurl = []
        try:
            request = self.session.get(self.url, headers=self.random_headers)

            loaded_json = json.loads(request.content)

            print('Filtering the URLs that have the word .onion in the text')
            for data in loaded_json['data']:
                reddit_url = 'https://www.reddit.com{}'.format(data['permalink'])
                try:
                    request = self.session.get(
                        reddit_url, headers=self.random_headers)
                    soup = BeautifulSoup(request.content, features="lxml")

                    for raw in soup.findAll('a', {'rel': 'nofollow'}):
                        if 'https://' in raw['href']:
                            raw_text = self.raw(url=raw['href'])
                            if raw_text is not None:
                                print('Applying REGEX. Wait...')
                                regex = re.compile(
                                    r"[A-Za-z0-9]{0,12}\.?[A-Za-z0-9]{12,50}\.onion")

                                for lines in raw_text.split('\n'):
                                    rurls = lines \
                                        .replace('\xad', '') \
                                        .replace('\n', '') \
                                        .replace("http://", '') \
                                        .replace("https://", '') \
                                        .replace(r'\s', '') \
                                        .replace('\t', '')

                                    xurl = regex.match(rurls)
                                    if xurl is not None:
                                        onionurl.append(xurl.group())

                except(requests.exceptions.ConnectionError,
                       requests.exceptions.ChunkedEncodingError,
                       requests.exceptions.ReadTimeout,
                       requests.exceptions.InvalidURL) as e:
                    print('Unable to connect to the URL because an error occurred.\n{e}'.format(e=e))

        except(requests.exceptions.ConnectionError,
               requests.exceptions.ChunkedEncodingError,
               requests.exceptions.ReadTimeout,
               requests.exceptions.InvalidURL) as e:
            print('Unable to connect to the URL because an error occurred.\n{e}'.format(e=e))

        return onionurl

    def raw(self, url):
        try:
            if url is not None:
                request = self.session.get(url, headers=self.random_headers)
                print('Connecting to {url} - {status}'.format(url=url, status=request.status_code))

                if request.status_code == 200:

                    soup = BeautifulSoup(request.content, features="lxml")
                    for s in soup(['script', 'style']):
                        s.decompose()

                    return ' '.join(soup.stripped_strings)

        except (requests.exceptions.ConnectionError,
                requests.exceptions.ChunkedEncodingError,
                requests.exceptions.ReadTimeout,
                requests.exceptions.TooManyRedirects) as e:
            pass


if __name__ == '__main__':
    app = Reddit()
    app.start
@ -0,0 +1,31 @@
#!/usr/bin/python3
# -*- coding: utf-8 -*-

__author__ = 'Andrey Glauzer'
__license__ = "MIT"
__version__ = "1.0.1"
__maintainer__ = "Andrey Glauzer"
__status__ = "Development"

import requests
from pathlib import Path

from onioningestor.sources import Source


class Plugin(Source):

    def __init__(self, logger, name, filename):
        self.logger = logger
        self.name = name
        self.filename = filename
        super().__init__(self)

    def run(self):
        filepath = Path(__file__).parents[2]/self.filename
        with open(filepath, 'r') as fp:
            lines = fp.read().splitlines()
            for onion in lines:
                yield self.onion(url=onion, source='simple-file', type='domain')