modified operator init

pull/1/head
danieleperera 4 years ago
parent a6deba300b
commit 7a2e4e2c75

@ -1,7 +1,10 @@
import re
import sys
import json
from itertools import islice
from datetime import datetime as dt
from concurrent.futures import ThreadPoolExecutor
class Operator:
"""Base class for all Operator plugins.
@ -72,18 +75,18 @@ class Operator:
if type == 'URL':
blacklist = self.blacklist.findall(response['hiddenService'])
elif type == 'HTML':
response['simple-html'].pop('status')
response['simple-html']['status'] = 'blocked'
blacklist = self.blacklist.findall(response['simple-html']['HTML'])
if blacklist:
response['simple-html'].pop('status')
response['simple-html']['status'] = 'blocked'
response['blacklist'] = list(set(blacklist))
self.es.update(db['_id'], response)
return False
return True
def process(self, onions):
"""Process all applicable onions."""
def collect(self, onions):
for onion in onions:
self.logger.info(f'thread function processing {onion}')
# Add link to database
db = self.es.save({
'hiddenService':onion.url,
@ -96,3 +99,18 @@ class Operator:
# Get data for current link
self.handle_onion(db, onion.url)
def iter_batches(self, data, batch_size):
    """Yield successive lists of at most ``batch_size`` items from ``data``.

    Args:
        data: Any iterable; it is consumed lazily, one batch at a time.
        batch_size: Maximum number of items per batch; must be >= 1.

    Yields:
        Non-empty lists of consecutive items. Only the final batch may be
        shorter than ``batch_size``.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1. Raised when
            iteration starts, because this is a generator.
    """
    # A size of 0 used to end the loop immediately and silently drop every
    # item, while a negative size made islice() raise ValueError. Reject
    # both the same way so a bad size is never mistaken for "no data".
    if batch_size < 1:
        raise ValueError(f'batch_size must be >= 1, got {batch_size!r}')
    iterator = iter(data)
    while True:
        batch = list(islice(iterator, batch_size))
        if not batch:
            return
        yield batch
def process(self, onions, max_workers=10, batch_size=10):
    """Process all applicable onions concurrently, in batches.

    ``onions`` is split into batches via ``self.iter_batches`` and each
    batch is submitted to ``self.collect`` on a thread pool.

    Args:
        onions: Iterable of onion objects to process.
        max_workers: Maximum number of worker threads (default 10).
        batch_size: Number of onions handed to each ``collect`` call
            (default 10).

    Raises:
        Exception: Whatever a ``collect`` call raised is re-raised here by
            ``Future.result()``, for the first failing batch in
            submission order.
    """
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [
            executor.submit(self.collect, batch)
            for batch in self.iter_batches(onions, batch_size=batch_size)
        ]
        for future in futures:
            # result() blocks until the batch has finished and re-raises
            # any exception from the worker, so failures are not lost.
            self.logger.info(future.result())

@ -78,6 +78,10 @@ class Plugin(Operator):
result = content.text
if result:
html = BeautifulSoup(result,features="lxml")
# testing hardcoded filepath
with open("/home/tony/Projects/OnionScraper_v2/onion_master_list.txt", "w") as fp:
for onion in re.findall('([a-z2-7]{16,56}\.onion)',result):
fp.write("%s\n" % onion)
if html:
index = {
'HTML':result,
@ -114,5 +118,5 @@ class Plugin(Operator):
def handle_onion(self, db, onion):
content = self.run_sessions(onion)
if content[self.plugin_name]['status'] == 'success':
if self._onion_is_allowed(db, content):
if self._onion_is_allowed(content, db, 'HTML'):
self.es.update(db['_id'], content)

@ -7,6 +7,7 @@ __version__ = "1.0.0"
__maintainer__ = "Daniele Perera"
__status__ = "Development"
import os
import requests
from pathlib import Path
@ -23,10 +24,14 @@ class Plugin(Source):
def run(self):
items = []
filepath = Path(__file__).parents[2]/self.filename
with open(filepath, 'r') as fp:
lines = fp.read().splitlines()
for onion in lines:
yield self.onion(url=onion,source='simple-file',type='domain')
# just testing
os.remove(self.filename)
for onion in lines:
items.append(self.onion(url=onion,source='simple-file',type='domain'))
#yield self.onion(url=onion,source='simple-file',type='domain')
return items

Loading…
Cancel
Save