modified operator init

pull/1/head
danieleperera 4 years ago
parent a6deba300b
commit 7a2e4e2c75

@@ -1,7 +1,10 @@
 import re
 import sys
 import json
+from itertools import islice
 from datetime import datetime as dt
+from concurrent.futures import ThreadPoolExecutor

 class Operator:
     """Base class for all Operator plugins.
@@ -72,18 +75,18 @@ class Operator:
         if type == 'URL':
             blacklist = self.blacklist.findall(response['hiddenService'])
         elif type == 'HTML':
-            response['simple-html'].pop('status')
-            response['simple-html']['status'] = 'blocked'
             blacklist = self.blacklist.findall(response['simple-html']['HTML'])
         if blacklist:
+            response['simple-html'].pop('status')
+            response['simple-html']['status'] = 'blocked'
+            response['blacklist'] = list(set(blacklist))
             self.es.update(db['_id'], response)
             return False
         return True

-    def process(self, onions):
-        """Process all applicable onions."""
+    def collect(self, onions):
         for onion in onions:
+            self.logger.info(f'thread function processing {onion}')
             # Add link to database
             db = self.es.save({
                 'hiddenService':onion.url,
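The reordering above means a record is only flagged once the blacklist regex actually matches, with the matched terms deduplicated into response['blacklist']. A minimal sketch of that core pattern, in isolation (the terms and record layout here are placeholders, not the project's config):

import re

# Hypothetical blacklist, compiled the way self.blacklist presumably is.
blacklist = re.compile('|'.join(['term1', 'term2']))

html = '... term1 ... term2 ... term1 ...'
matches = blacklist.findall(html)
if matches:
    # Mirror the commit: mark blocked and store unique matches.
    record = {'simple-html': {'status': 'blocked'},
              'blacklist': list(set(matches))}
    print(record)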
@@ -96,3 +99,18 @@ class Operator:
             # Get data for current link
             self.handle_onion(db, onion.url)
+
+    def iter_batches(self, data, batch_size):
+        data = iter(data)
+        while True:
+            batch = list(islice(data, batch_size))
+            if len(batch) == 0:
+                break
+            yield batch
+
+    def process(self, onions):
+        """Process all applicable onions."""
+        #print(onions)
+        with ThreadPoolExecutor(max_workers=10) as executor:
+            collect_tasks = [executor.submit(self.collect, files_batch) for files_batch in self.iter_batches(onions, batch_size=10)]
+            for tasks in collect_tasks:
+                self.logger.info(tasks.result())
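Taken together, iter_batches and the new process form a fan-out pattern: slice the onion stream into fixed-size batches and hand each batch to a worker thread. A self-contained sketch of the same pattern, with a stand-in for collect (the real method wires in Elasticsearch and per-onion handling):

from itertools import islice
from concurrent.futures import ThreadPoolExecutor

def iter_batches(data, batch_size):
    # Yield successive batch_size-length lists from any iterable.
    data = iter(data)
    while True:
        batch = list(islice(data, batch_size))
        if len(batch) == 0:
            break
        yield batch

def collect(batch):
    # Stand-in worker: the commit submits Operator.collect here.
    return f'processed batch of {len(batch)}'

onions = [f'onion-{i}' for i in range(25)]
with ThreadPoolExecutor(max_workers=10) as executor:
    tasks = [executor.submit(collect, b) for b in iter_batches(onions, batch_size=10)]
    for task in tasks:
        print(task.result())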

@@ -78,6 +78,10 @@ class Plugin(Operator):
             result = content.text
         if result:
             html = BeautifulSoup(result,features="lxml")
+            # testing hardcoded filepath
+            with open("/home/tony/Projects/OnionScraper_v2/onion_master_list.txt", "w") as fp:
+                for onion in re.findall('([a-z2-7]{16,56}\.onion)',result):
+                    fp.write("%s\n" % onion)
             if html:
                 index = {
                     'HTML':result,
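For reference, the character class [a-z2-7] is the base32 alphabet, so the pattern added above catches both 16-character v2 and 56-character v3 onion hostnames. A quick standalone check (the addresses below are fabricated):

import re

ONION_RE = re.compile(r'([a-z2-7]{16,56}\.onion)')

text = 'see http://abcdefghij234567.onion and http://' + 'a' * 56 + '.onion'
print(ONION_RE.findall(text))  # both fabricated addresses match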
@@ -114,5 +118,5 @@ class Plugin(Operator):
     def handle_onion(self, db, onion):
         content = self.run_sessions(onion)
         if content[self.plugin_name]['status'] == 'success':
-            if self._onion_is_allowed(db, content):
+            if self._onion_is_allowed(content, db, 'HTML'):
                 self.es.update(db['_id'], content)

@@ -7,6 +7,7 @@ __version__ = "1.0.0"
 __maintainer__ = "Daniele Perera"
 __status__ = "Development"

+import os
 import requests
 from pathlib import Path
@@ -23,10 +24,14 @@ class Plugin(Source):
     def run(self):
+        items = []
         filepath = Path(__file__).parents[2]/self.filename
         with open(filepath, 'r') as fp:
             lines = fp.read().splitlines()
-        for onion in lines:
-            yield self.onion(url=onion,source='simple-file',type='domain')
+        # just testing
         os.remove(self.filename)
+        for onion in lines:
+            items.append(self.onion(url=onion,source='simple-file',type='domain'))
+            #yield self.onion(url=onion,source='simple-file',type='domain')
+        return items
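Note what the switch from yield to a returned list changes: the old run() was a generator, so its body (including os.remove) only executed once a consumer iterated it; the rewritten version reads the file, deletes it, and builds all items eagerly at call time. A minimal illustration of that difference:

def lazy():
    print('body runs')  # deferred until someone iterates
    yield 1

def eager():
    print('body runs')  # runs immediately on call
    return [1]

g = lazy()   # prints nothing yet
list(g)      # now prints 'body runs'
eager()      # prints 'body runs' right away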
