You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
OnionIngestor/onioningestor/operators/html.py

141 lines
5.5 KiB
Python

import re
import time
import json
import traceback
from datetime import datetime as dt
from json.decoder import JSONDecodeError
import requests
from bs4 import BeautifulSoup
from langdetect import detect
from stem.control import Controller
from stem import Signal
from onioningestor.operators import Operator
class Plugin(Operator):
    """Simple-html operator.

    Fetches the HTML of an onion address through the configured proxy,
    builds an index document from it (raw HTML, title, detected language,
    crawl timestamp, matched keywords) and pushes it to Elasticsearch via
    the ``Operator`` base class.
    """

    # Base32 labels of 16 to 56 characters followed by ".onion".
    ONION_PATTERN = re.compile(r"([a-z2-7]{16,56}\.onion)")

    # Where onion addresses discovered in a crawled page are dumped.
    # The original code hard-coded this developer path inline; it is kept as
    # the default so behavior is unchanged, but it can now be overridden on a
    # subclass or instance.
    # NOTE(review): the file is opened in "w" mode, so every crawl replaces
    # the previous content -- confirm whether appending was intended.
    ONION_LIST_PATH = "/home/tony/Projects/OnionScraper_v2/onion_master_list.txt"

    def __init__(self, logger, elasticsearch, allowed_sources, **kwargs):
        """Configure the plugin from its keyword settings.

        Required ``kwargs`` keys:
            timeout: request timeout, converted with ``int()``
                (passed to ``requests`` as-is -- presumably seconds).
            retries: number of extra attempts after the first failure.
            interestingKeywords: comma-separated keywords to look for.
            socks5: proxies mapping assigned to the requests session.
            TorController: mapping with ``port`` and ``password`` keys.

        Raises:
            KeyError: if a required key is missing.
            ValueError: if ``timeout`` or ``retries`` is not an integer.
        """
        super().__init__(logger, elasticsearch, allowed_sources)
        self.plugin_name = "simple-html"
        self.logger.info(f"Initializing {self.plugin_name}")
        self.timeout = int(kwargs["timeout"])
        self.retries = int(kwargs["retries"])
        # Drop blank entries and surrounding whitespace: an empty alternative
        # would make the pattern match the empty string at every position.
        keywords = [
            word.strip()
            for word in kwargs["interestingKeywords"].split(",")
            if word.strip()
        ]
        # "(?!)" is a pattern that can never match; used when no keyword is set.
        pattern = "|".join(re.escape(word) for word in keywords) or r"(?!)"
        self.interesting = re.compile(pattern, re.IGNORECASE)
        self.proxy = kwargs["socks5"]
        self.torControl = kwargs["TorController"]
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:72.0) Gecko/20100101 Firefox/72.0",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
            "Accept-Language": "en-US,en;q=0.5",
            "DNT": "1",
            "Connection": "keep-alive",
            "Upgrade-Insecure-Requests": "1",
        }

    def get_tor_session(self):
        """Return a new ``requests`` session using the configured proxy and headers.

        The caller owns the session and should close it (it works as a
        context manager).

        Raises:
            Exception: any error raised while building the session is logged
                and re-raised. The original fell through to ``return s`` with
                ``s`` unbound, hiding the real error behind an
                ``UnboundLocalError``.
        """
        try:
            session = requests.session()
            session.proxies = self.proxy
            session.headers.update(self.headers)
        except Exception as e:
            self.logger.error(e)
            # format_exc() returns the traceback text; print_exc() returns None.
            self.logger.debug(traceback.format_exc())
            raise
        return session

    def renew_connection(self):
        """Ask the local Tor controller for a new identity and log the new IP.

        Errors talking to the controller propagate to the caller. The IP
        lookup is informational only, so a failure there is just logged.
        """
        with Controller.from_port(port=self.torControl["port"]) as controller:
            # Now we switch TOR identities to make sure we have a good connection
            self.logger.info("Getting new Tor IP")
            # authenticate to our local TOR controller
            controller.authenticate(self.torControl["password"])
            # send the signal for a new identity
            controller.signal(Signal.NEWNYM)
            # wait for the new identity to be initialized
            time.sleep(controller.get_newnym_wait())
            try:
                with self.get_tor_session() as session:
                    origin = session.get(
                        "http://httpbin.org/ip", timeout=self.timeout
                    ).json()["origin"]
            except (requests.exceptions.RequestException, ValueError, KeyError) as e:
                self.logger.warning(f"Could not determine new Tor IP: {e}")
            else:
                self.logger.info(f"IP is {origin}")

    def _save_discovered_onions(self, text):
        """Write every onion address found in ``text`` to ``ONION_LIST_PATH``.

        This is a side channel of the crawl, so an unwritable path is logged
        instead of failing the whole request.
        """
        discovered = self.ONION_PATTERN.findall(text)
        try:
            with open(self.ONION_LIST_PATH, "w") as fp:
                fp.writelines(f"{found}\n" for found in discovered)
        except OSError as e:
            self.logger.warning(
                f"Could not write discovered onions to {self.ONION_LIST_PATH}: {e}"
            )

    def _build_index(self, result):
        """Build the index document for a successfully fetched page.

        ``title`` and ``language`` are ``None`` when the page has no
        ``<title>`` element or its language cannot be detected.
        """
        # Imported here so the file's top-level import block stays untouched.
        from langdetect.lang_detect_exception import LangDetectException

        html = BeautifulSoup(result, features="lxml")
        title = html.title.text if html.title is not None else None
        try:
            language = detect(html.text)
        except LangDetectException:
            # Raised when the text has no features to detect a language from.
            language = None
        return {
            "HTML": result,
            "title": title,
            "language": language,
            "date-crawled": dt.utcnow().strftime("%Y-%m-%dT%H:%M:%S.%f") + "Z",
            "status": "success",
            "interestingKeywords": list(set(self.interesting.findall(result))),
        }

    def run_sessions(self, onion):
        """Fetch ``http://<onion>`` and return the plugin response.

        The page is requested up to ``self.retries + 1`` times, with a new
        Tor identity requested between attempts. An attempt succeeds when
        the server answers 200 with a non-empty body.

        Returns:
            ``self.response(...)`` wrapping either the index document
            (``status`` = ``"success"``) or ``{"status": "failure"}`` once
            the retries are exhausted.
        """
        url = "http://" + onion
        retry = 0
        while True:
            try:
                self.logger.info(url)
                with self.get_tor_session() as session:
                    # Without a timeout a stalled hidden service blocks forever.
                    content = session.get(url, timeout=self.timeout)
                if content.status_code == 200 and content.text:
                    result = content.text
                    self._save_discovered_onions(result)
                    index = self._build_index(result)
                    return self.response(index, onion, self.plugin_name)
            except requests.exceptions.ConnectionError as connection_error:
                self.logger.error(f"Failed connecting to {url}")
                self.logger.debug(connection_error)
            except Exception as e:
                self.logger.error(e)
                self.logger.debug(traceback.format_exc())
            self.logger.info("[x] No results found retrying ...")
            retry += 1
            # Give up before asking Tor for an identity that would go unused.
            if retry > self.retries:
                self.logger.error("[x] Max retries exceeded")
                return self.response({"status": "failure"}, onion, self.plugin_name)
            self.renew_connection()

    def handle_onion(self, db, onion):
        """Crawl ``onion`` and store the result if it succeeded and is allowed.

        ``db`` must provide the document id under ``"_id"``.
        """
        content = self.run_sessions(onion)
        if content[self.plugin_name]["status"] == "success":
            if self._onion_is_allowed(content, db, "HTML"):
                self.es.update(db["_id"], content)