Initial commit

This commit is contained in:
Your Name
2026-04-23 17:07:55 +08:00
commit b7e39e063b
16725 changed files with 1625565 additions and 0 deletions
+155
View File
@@ -0,0 +1,155 @@
# Copyright (C) 2018-2019 Garmin Ltd.
#
# SPDX-License-Identifier: GPL-2.0-only
#
import asyncio
from contextlib import closing
import re
import sqlite3
import itertools
import json
UNIX_PREFIX = "unix://"
ADDR_TYPE_UNIX = 0
ADDR_TYPE_TCP = 1
# The Python async server defaults to a 64K receive buffer, so we hardcode our
# maximum chunk size. It would be better if the client and server reported to
# each other what the maximum chunk sizes were, but that will slow down the
# connection setup with a round trip delay so I'd rather not do that unless it
# is necessary
DEFAULT_MAX_CHUNK = 32 * 1024
UNIHASH_TABLE_DEFINITION = (
("method", "TEXT NOT NULL", "UNIQUE"),
("taskhash", "TEXT NOT NULL", "UNIQUE"),
("unihash", "TEXT NOT NULL", ""),
)
UNIHASH_TABLE_COLUMNS = tuple(name for name, _, _ in UNIHASH_TABLE_DEFINITION)
OUTHASH_TABLE_DEFINITION = (
("method", "TEXT NOT NULL", "UNIQUE"),
("taskhash", "TEXT NOT NULL", "UNIQUE"),
("outhash", "TEXT NOT NULL", "UNIQUE"),
("created", "DATETIME", ""),
# Optional fields
("owner", "TEXT", ""),
("PN", "TEXT", ""),
("PV", "TEXT", ""),
("PR", "TEXT", ""),
("task", "TEXT", ""),
("outhash_siginfo", "TEXT", ""),
)
OUTHASH_TABLE_COLUMNS = tuple(name for name, _, _ in OUTHASH_TABLE_DEFINITION)
def _make_table(cursor, name, definition):
cursor.execute('''
CREATE TABLE IF NOT EXISTS {name} (
id INTEGER PRIMARY KEY AUTOINCREMENT,
{fields}
UNIQUE({unique})
)
'''.format(
name=name,
fields=" ".join("%s %s," % (name, typ) for name, typ, _ in definition),
unique=", ".join(name for name, _, flags in definition if "UNIQUE" in flags)
))
def setup_database(database, sync=True):
db = sqlite3.connect(database)
db.row_factory = sqlite3.Row
with closing(db.cursor()) as cursor:
_make_table(cursor, "unihashes_v2", UNIHASH_TABLE_DEFINITION)
_make_table(cursor, "outhashes_v2", OUTHASH_TABLE_DEFINITION)
cursor.execute('PRAGMA journal_mode = WAL')
cursor.execute('PRAGMA synchronous = %s' % ('NORMAL' if sync else 'OFF'))
# Drop old indexes
cursor.execute('DROP INDEX IF EXISTS taskhash_lookup')
cursor.execute('DROP INDEX IF EXISTS outhash_lookup')
cursor.execute('DROP INDEX IF EXISTS taskhash_lookup_v2')
cursor.execute('DROP INDEX IF EXISTS outhash_lookup_v2')
# TODO: Upgrade from tasks_v2?
cursor.execute('DROP TABLE IF EXISTS tasks_v2')
# Create new indexes
cursor.execute('CREATE INDEX IF NOT EXISTS taskhash_lookup_v3 ON unihashes_v2 (method, taskhash)')
cursor.execute('CREATE INDEX IF NOT EXISTS outhash_lookup_v3 ON outhashes_v2 (method, outhash)')
return db
def parse_address(addr):
if addr.startswith(UNIX_PREFIX):
return (ADDR_TYPE_UNIX, (addr[len(UNIX_PREFIX):],))
else:
m = re.match(r'\[(?P<host>[^\]]*)\]:(?P<port>\d+)$', addr)
if m is not None:
host = m.group('host')
port = m.group('port')
else:
host, port = addr.split(':')
return (ADDR_TYPE_TCP, (host, int(port)))
def chunkify(msg, max_chunk):
if len(msg) < max_chunk - 1:
yield ''.join((msg, "\n"))
else:
yield ''.join((json.dumps({
'chunk-stream': None
}), "\n"))
args = [iter(msg)] * (max_chunk - 1)
for m in map(''.join, itertools.zip_longest(*args, fillvalue='')):
yield ''.join(itertools.chain(m, "\n"))
yield "\n"
def create_server(addr, dbname, *, sync=True, upstream=None, read_only=False):
from . import server
db = setup_database(dbname, sync=sync)
s = server.Server(db, upstream=upstream, read_only=read_only)
(typ, a) = parse_address(addr)
if typ == ADDR_TYPE_UNIX:
s.start_unix_server(*a)
else:
s.start_tcp_server(*a)
return s
def create_client(addr):
from . import client
c = client.Client()
(typ, a) = parse_address(addr)
if typ == ADDR_TYPE_UNIX:
c.connect_unix(*a)
else:
c.connect_tcp(*a)
return c
async def create_async_client(addr):
from . import client
c = client.AsyncClient()
(typ, a) = parse_address(addr)
if typ == ADDR_TYPE_UNIX:
await c.connect_unix(*a)
else:
await c.connect_tcp(*a)
return c
+121
View File
@@ -0,0 +1,121 @@
# Copyright (C) 2019 Garmin Ltd.
#
# SPDX-License-Identifier: GPL-2.0-only
#
import logging
import socket
import bb.asyncrpc
from . import create_async_client
logger = logging.getLogger("hashserv.client")
class AsyncClient(bb.asyncrpc.AsyncClient):
MODE_NORMAL = 0
MODE_GET_STREAM = 1
def __init__(self):
super().__init__('OEHASHEQUIV', '1.1', logger)
self.mode = self.MODE_NORMAL
async def setup_connection(self):
await super().setup_connection()
cur_mode = self.mode
self.mode = self.MODE_NORMAL
await self._set_mode(cur_mode)
async def send_stream(self, msg):
async def proc():
self.writer.write(("%s\n" % msg).encode("utf-8"))
await self.writer.drain()
l = await self.reader.readline()
if not l:
raise ConnectionError("Connection closed")
return l.decode("utf-8").rstrip()
return await self._send_wrapper(proc)
async def _set_mode(self, new_mode):
if new_mode == self.MODE_NORMAL and self.mode == self.MODE_GET_STREAM:
r = await self.send_stream("END")
if r != "ok":
raise ConnectionError("Bad response from server %r" % r)
elif new_mode == self.MODE_GET_STREAM and self.mode == self.MODE_NORMAL:
r = await self.send_message({"get-stream": None})
if r != "ok":
raise ConnectionError("Bad response from server %r" % r)
elif new_mode != self.mode:
raise Exception(
"Undefined mode transition %r -> %r" % (self.mode, new_mode)
)
self.mode = new_mode
async def get_unihash(self, method, taskhash):
await self._set_mode(self.MODE_GET_STREAM)
r = await self.send_stream("%s %s" % (method, taskhash))
if not r:
return None
return r
async def report_unihash(self, taskhash, method, outhash, unihash, extra={}):
await self._set_mode(self.MODE_NORMAL)
m = extra.copy()
m["taskhash"] = taskhash
m["method"] = method
m["outhash"] = outhash
m["unihash"] = unihash
return await self.send_message({"report": m})
async def report_unihash_equiv(self, taskhash, method, unihash, extra={}):
await self._set_mode(self.MODE_NORMAL)
m = extra.copy()
m["taskhash"] = taskhash
m["method"] = method
m["unihash"] = unihash
return await self.send_message({"report-equiv": m})
async def get_taskhash(self, method, taskhash, all_properties=False):
await self._set_mode(self.MODE_NORMAL)
return await self.send_message(
{"get": {"taskhash": taskhash, "method": method, "all": all_properties}}
)
async def get_outhash(self, method, outhash, taskhash):
await self._set_mode(self.MODE_NORMAL)
return await self.send_message(
{"get-outhash": {"outhash": outhash, "taskhash": taskhash, "method": method}}
)
async def get_stats(self):
await self._set_mode(self.MODE_NORMAL)
return await self.send_message({"get-stats": None})
async def reset_stats(self):
await self._set_mode(self.MODE_NORMAL)
return await self.send_message({"reset-stats": None})
async def backfill_wait(self):
await self._set_mode(self.MODE_NORMAL)
return (await self.send_message({"backfill-wait": None}))["tasks"]
class Client(bb.asyncrpc.Client):
def __init__(self):
super().__init__()
self._add_methods(
"connect_tcp",
"get_unihash",
"report_unihash",
"report_unihash_equiv",
"get_taskhash",
"get_outhash",
"get_stats",
"reset_stats",
"backfill_wait",
)
def _get_async_client(self):
return AsyncClient()
+562
View File
@@ -0,0 +1,562 @@
# Copyright (C) 2019 Garmin Ltd.
#
# SPDX-License-Identifier: GPL-2.0-only
#
from contextlib import closing, contextmanager
from datetime import datetime
import enum
import asyncio
import logging
import math
import time
from . import create_async_client, UNIHASH_TABLE_COLUMNS, OUTHASH_TABLE_COLUMNS
import bb.asyncrpc
logger = logging.getLogger('hashserv.server')
class Measurement(object):
def __init__(self, sample):
self.sample = sample
def start(self):
self.start_time = time.perf_counter()
def end(self):
self.sample.add(time.perf_counter() - self.start_time)
def __enter__(self):
self.start()
return self
def __exit__(self, *args, **kwargs):
self.end()
class Sample(object):
def __init__(self, stats):
self.stats = stats
self.num_samples = 0
self.elapsed = 0
def measure(self):
return Measurement(self)
def __enter__(self):
return self
def __exit__(self, *args, **kwargs):
self.end()
def add(self, elapsed):
self.num_samples += 1
self.elapsed += elapsed
def end(self):
if self.num_samples:
self.stats.add(self.elapsed)
self.num_samples = 0
self.elapsed = 0
class Stats(object):
def __init__(self):
self.reset()
def reset(self):
self.num = 0
self.total_time = 0
self.max_time = 0
self.m = 0
self.s = 0
self.current_elapsed = None
def add(self, elapsed):
self.num += 1
if self.num == 1:
self.m = elapsed
self.s = 0
else:
last_m = self.m
self.m = last_m + (elapsed - last_m) / self.num
self.s = self.s + (elapsed - last_m) * (elapsed - self.m)
self.total_time += elapsed
if self.max_time < elapsed:
self.max_time = elapsed
def start_sample(self):
return Sample(self)
@property
def average(self):
if self.num == 0:
return 0
return self.total_time / self.num
@property
def stdev(self):
if self.num <= 1:
return 0
return math.sqrt(self.s / (self.num - 1))
def todict(self):
return {k: getattr(self, k) for k in ('num', 'total_time', 'max_time', 'average', 'stdev')}
@enum.unique
class Resolve(enum.Enum):
FAIL = enum.auto()
IGNORE = enum.auto()
REPLACE = enum.auto()
def insert_table(cursor, table, data, on_conflict):
resolve = {
Resolve.FAIL: "",
Resolve.IGNORE: " OR IGNORE",
Resolve.REPLACE: " OR REPLACE",
}[on_conflict]
keys = sorted(data.keys())
query = 'INSERT{resolve} INTO {table} ({fields}) VALUES({values})'.format(
resolve=resolve,
table=table,
fields=", ".join(keys),
values=", ".join(":" + k for k in keys),
)
prevrowid = cursor.lastrowid
cursor.execute(query, data)
logging.debug(
"Inserting %r into %s, %s",
data,
table,
on_conflict
)
return (cursor.lastrowid, cursor.lastrowid != prevrowid)
def insert_unihash(cursor, data, on_conflict):
return insert_table(cursor, "unihashes_v2", data, on_conflict)
def insert_outhash(cursor, data, on_conflict):
return insert_table(cursor, "outhashes_v2", data, on_conflict)
async def copy_unihash_from_upstream(client, db, method, taskhash):
d = await client.get_taskhash(method, taskhash)
if d is not None:
with closing(db.cursor()) as cursor:
insert_unihash(
cursor,
{k: v for k, v in d.items() if k in UNIHASH_TABLE_COLUMNS},
Resolve.IGNORE,
)
db.commit()
return d
class ServerCursor(object):
def __init__(self, db, cursor, upstream):
self.db = db
self.cursor = cursor
self.upstream = upstream
class ServerClient(bb.asyncrpc.AsyncServerConnection):
def __init__(self, reader, writer, db, request_stats, backfill_queue, upstream, read_only):
super().__init__(reader, writer, 'OEHASHEQUIV', logger)
self.db = db
self.request_stats = request_stats
self.max_chunk = bb.asyncrpc.DEFAULT_MAX_CHUNK
self.backfill_queue = backfill_queue
self.upstream = upstream
self.handlers.update({
'get': self.handle_get,
'get-outhash': self.handle_get_outhash,
'get-stream': self.handle_get_stream,
'get-stats': self.handle_get_stats,
})
if not read_only:
self.handlers.update({
'report': self.handle_report,
'report-equiv': self.handle_equivreport,
'reset-stats': self.handle_reset_stats,
'backfill-wait': self.handle_backfill_wait,
})
def validate_proto_version(self):
return (self.proto_version > (1, 0) and self.proto_version <= (1, 1))
async def process_requests(self):
if self.upstream is not None:
self.upstream_client = await create_async_client(self.upstream)
else:
self.upstream_client = None
await super().process_requests()
if self.upstream_client is not None:
await self.upstream_client.close()
async def dispatch_message(self, msg):
for k in self.handlers.keys():
if k in msg:
logger.debug('Handling %s' % k)
if 'stream' in k:
await self.handlers[k](msg[k])
else:
with self.request_stats.start_sample() as self.request_sample, \
self.request_sample.measure():
await self.handlers[k](msg[k])
return
raise bb.asyncrpc.ClientError("Unrecognized command %r" % msg)
async def handle_get(self, request):
method = request['method']
taskhash = request['taskhash']
fetch_all = request.get('all', False)
with closing(self.db.cursor()) as cursor:
d = await self.get_unihash(cursor, method, taskhash, fetch_all)
self.write_message(d)
async def get_unihash(self, cursor, method, taskhash, fetch_all=False):
d = None
if fetch_all:
cursor.execute(
'''
SELECT *, unihashes_v2.unihash AS unihash FROM outhashes_v2
INNER JOIN unihashes_v2 ON unihashes_v2.method=outhashes_v2.method AND unihashes_v2.taskhash=outhashes_v2.taskhash
WHERE outhashes_v2.method=:method AND outhashes_v2.taskhash=:taskhash
ORDER BY outhashes_v2.created ASC
LIMIT 1
''',
{
'method': method,
'taskhash': taskhash,
}
)
row = cursor.fetchone()
if row is not None:
d = {k: row[k] for k in row.keys()}
elif self.upstream_client is not None:
d = await self.upstream_client.get_taskhash(method, taskhash, True)
self.update_unified(cursor, d)
self.db.commit()
else:
row = self.query_equivalent(cursor, method, taskhash)
if row is not None:
d = {k: row[k] for k in row.keys()}
elif self.upstream_client is not None:
d = await self.upstream_client.get_taskhash(method, taskhash)
d = {k: v for k, v in d.items() if k in UNIHASH_TABLE_COLUMNS}
insert_unihash(cursor, d, Resolve.IGNORE)
self.db.commit()
return d
async def handle_get_outhash(self, request):
method = request['method']
outhash = request['outhash']
taskhash = request['taskhash']
with closing(self.db.cursor()) as cursor:
d = await self.get_outhash(cursor, method, outhash, taskhash)
self.write_message(d)
async def get_outhash(self, cursor, method, outhash, taskhash):
d = None
cursor.execute(
'''
SELECT *, unihashes_v2.unihash AS unihash FROM outhashes_v2
INNER JOIN unihashes_v2 ON unihashes_v2.method=outhashes_v2.method AND unihashes_v2.taskhash=outhashes_v2.taskhash
WHERE outhashes_v2.method=:method AND outhashes_v2.outhash=:outhash
ORDER BY outhashes_v2.created ASC
LIMIT 1
''',
{
'method': method,
'outhash': outhash,
}
)
row = cursor.fetchone()
if row is not None:
d = {k: row[k] for k in row.keys()}
elif self.upstream_client is not None:
d = await self.upstream_client.get_outhash(method, outhash, taskhash)
self.update_unified(cursor, d)
self.db.commit()
return d
def update_unified(self, cursor, data):
if data is None:
return
insert_unihash(
cursor,
{k: v for k, v in data.items() if k in UNIHASH_TABLE_COLUMNS},
Resolve.IGNORE
)
insert_outhash(
cursor,
{k: v for k, v in data.items() if k in OUTHASH_TABLE_COLUMNS},
Resolve.IGNORE
)
async def handle_get_stream(self, request):
self.write_message('ok')
while True:
upstream = None
l = await self.reader.readline()
if not l:
return
try:
# This inner loop is very sensitive and must be as fast as
# possible (which is why the request sample is handled manually
# instead of using 'with', and also why logging statements are
# commented out.
self.request_sample = self.request_stats.start_sample()
request_measure = self.request_sample.measure()
request_measure.start()
l = l.decode('utf-8').rstrip()
if l == 'END':
self.writer.write('ok\n'.encode('utf-8'))
return
(method, taskhash) = l.split()
#logger.debug('Looking up %s %s' % (method, taskhash))
cursor = self.db.cursor()
try:
row = self.query_equivalent(cursor, method, taskhash)
finally:
cursor.close()
if row is not None:
msg = ('%s\n' % row['unihash']).encode('utf-8')
#logger.debug('Found equivalent task %s -> %s', (row['taskhash'], row['unihash']))
elif self.upstream_client is not None:
upstream = await self.upstream_client.get_unihash(method, taskhash)
if upstream:
msg = ("%s\n" % upstream).encode("utf-8")
else:
msg = "\n".encode("utf-8")
else:
msg = '\n'.encode('utf-8')
self.writer.write(msg)
finally:
request_measure.end()
self.request_sample.end()
await self.writer.drain()
# Post to the backfill queue after writing the result to minimize
# the turn around time on a request
if upstream is not None:
await self.backfill_queue.put((method, taskhash))
async def handle_report(self, data):
with closing(self.db.cursor()) as cursor:
outhash_data = {
'method': data['method'],
'outhash': data['outhash'],
'taskhash': data['taskhash'],
'created': datetime.now()
}
for k in ('owner', 'PN', 'PV', 'PR', 'task', 'outhash_siginfo'):
if k in data:
outhash_data[k] = data[k]
# Insert the new entry, unless it already exists
(rowid, inserted) = insert_outhash(cursor, outhash_data, Resolve.IGNORE)
if inserted:
# If this row is new, check if it is equivalent to another
# output hash
cursor.execute(
'''
SELECT outhashes_v2.taskhash AS taskhash, unihashes_v2.unihash AS unihash FROM outhashes_v2
INNER JOIN unihashes_v2 ON unihashes_v2.method=outhashes_v2.method AND unihashes_v2.taskhash=outhashes_v2.taskhash
-- Select any matching output hash except the one we just inserted
WHERE outhashes_v2.method=:method AND outhashes_v2.outhash=:outhash AND outhashes_v2.taskhash!=:taskhash
-- Pick the oldest hash
ORDER BY outhashes_v2.created ASC
LIMIT 1
''',
{
'method': data['method'],
'outhash': data['outhash'],
'taskhash': data['taskhash'],
}
)
row = cursor.fetchone()
if row is not None:
# A matching output hash was found. Set our taskhash to the
# same unihash since they are equivalent
unihash = row['unihash']
resolve = Resolve.IGNORE
else:
# No matching output hash was found. This is probably the
# first outhash to be added.
unihash = data['unihash']
resolve = Resolve.IGNORE
# Query upstream to see if it has a unihash we can use
if self.upstream_client is not None:
upstream_data = await self.upstream_client.get_outhash(data['method'], data['outhash'], data['taskhash'])
if upstream_data is not None:
unihash = upstream_data['unihash']
insert_unihash(
cursor,
{
'method': data['method'],
'taskhash': data['taskhash'],
'unihash': unihash,
},
resolve
)
unihash_data = await self.get_unihash(cursor, data['method'], data['taskhash'])
if unihash_data is not None:
unihash = unihash_data['unihash']
else:
unihash = data['unihash']
self.db.commit()
d = {
'taskhash': data['taskhash'],
'method': data['method'],
'unihash': unihash,
}
self.write_message(d)
async def handle_equivreport(self, data):
with closing(self.db.cursor()) as cursor:
insert_data = {
'method': data['method'],
'taskhash': data['taskhash'],
'unihash': data['unihash'],
}
insert_unihash(cursor, insert_data, Resolve.IGNORE)
self.db.commit()
# Fetch the unihash that will be reported for the taskhash. If the
# unihash matches, it means this row was inserted (or the mapping
# was already valid)
row = self.query_equivalent(cursor, data['method'], data['taskhash'])
if row['unihash'] == data['unihash']:
logger.info('Adding taskhash equivalence for %s with unihash %s',
data['taskhash'], row['unihash'])
d = {k: row[k] for k in ('taskhash', 'method', 'unihash')}
self.write_message(d)
async def handle_get_stats(self, request):
d = {
'requests': self.request_stats.todict(),
}
self.write_message(d)
async def handle_reset_stats(self, request):
d = {
'requests': self.request_stats.todict(),
}
self.request_stats.reset()
self.write_message(d)
async def handle_backfill_wait(self, request):
d = {
'tasks': self.backfill_queue.qsize(),
}
await self.backfill_queue.join()
self.write_message(d)
def query_equivalent(self, cursor, method, taskhash):
# This is part of the inner loop and must be as fast as possible
cursor.execute(
'SELECT taskhash, method, unihash FROM unihashes_v2 WHERE method=:method AND taskhash=:taskhash',
{
'method': method,
'taskhash': taskhash,
}
)
return cursor.fetchone()
class Server(bb.asyncrpc.AsyncServer):
def __init__(self, db, upstream=None, read_only=False):
if upstream and read_only:
raise bb.asyncrpc.ServerError("Read-only hashserv cannot pull from an upstream server")
super().__init__(logger)
self.request_stats = Stats()
self.db = db
self.upstream = upstream
self.read_only = read_only
def accept_client(self, reader, writer):
return ServerClient(reader, writer, self.db, self.request_stats, self.backfill_queue, self.upstream, self.read_only)
@contextmanager
def _backfill_worker(self):
async def backfill_worker_task():
client = await create_async_client(self.upstream)
try:
while True:
item = await self.backfill_queue.get()
if item is None:
self.backfill_queue.task_done()
break
method, taskhash = item
await copy_unihash_from_upstream(client, self.db, method, taskhash)
self.backfill_queue.task_done()
finally:
await client.close()
async def join_worker(worker):
await self.backfill_queue.put(None)
await worker
if self.upstream is not None:
worker = asyncio.ensure_future(backfill_worker_task())
try:
yield
finally:
self.loop.run_until_complete(join_worker(worker))
else:
yield
def run_loop_forever(self):
self.backfill_queue = asyncio.Queue()
with self._backfill_worker():
super().run_loop_forever()
+433
View File
@@ -0,0 +1,433 @@
#! /usr/bin/env python3
#
# Copyright (C) 2018-2019 Garmin Ltd.
#
# SPDX-License-Identifier: GPL-2.0-only
#
from . import create_server, create_client
import hashlib
import logging
import multiprocessing
import os
import sys
import tempfile
import threading
import unittest
import socket
import time
import signal
def server_prefunc(server, idx):
logging.basicConfig(level=logging.DEBUG, filename='bbhashserv-%d.log' % idx, filemode='w',
format='%(levelname)s %(filename)s:%(lineno)d %(message)s')
server.logger.debug("Running server %d" % idx)
sys.stdout = open('bbhashserv-stdout-%d.log' % idx, 'w')
sys.stderr = sys.stdout
class HashEquivalenceTestSetup(object):
METHOD = 'TestMethod'
server_index = 0
def start_server(self, dbpath=None, upstream=None, read_only=False, prefunc=server_prefunc):
self.server_index += 1
if dbpath is None:
dbpath = os.path.join(self.temp_dir.name, "db%d.sqlite" % self.server_index)
def cleanup_server(server):
if server.process.exitcode is not None:
return
server.process.terminate()
server.process.join()
server = create_server(self.get_server_addr(self.server_index),
dbpath,
upstream=upstream,
read_only=read_only)
server.dbpath = dbpath
server.serve_as_process(prefunc=prefunc, args=(self.server_index,))
self.addCleanup(cleanup_server, server)
def cleanup_client(client):
client.close()
client = create_client(server.address)
self.addCleanup(cleanup_client, client)
return (client, server)
def setUp(self):
if sys.version_info < (3, 5, 0):
self.skipTest('Python 3.5 or later required')
self.temp_dir = tempfile.TemporaryDirectory(prefix='bb-hashserv')
self.addCleanup(self.temp_dir.cleanup)
(self.client, self.server) = self.start_server()
def assertClientGetHash(self, client, taskhash, unihash):
result = client.get_unihash(self.METHOD, taskhash)
self.assertEqual(result, unihash)
class HashEquivalenceCommonTests(object):
def test_create_hash(self):
# Simple test that hashes can be created
taskhash = '35788efcb8dfb0a02659d81cf2bfd695fb30faf9'
outhash = '2765d4a5884be49b28601445c2760c5f21e7e5c0ee2b7e3fce98fd7e5970796f'
unihash = 'f46d3fbb439bd9b921095da657a4de906510d2cd'
self.assertClientGetHash(self.client, taskhash, None)
result = self.client.report_unihash(taskhash, self.METHOD, outhash, unihash)
self.assertEqual(result['unihash'], unihash, 'Server returned bad unihash')
def test_create_equivalent(self):
# Tests that a second reported task with the same outhash will be
# assigned the same unihash
taskhash = '53b8dce672cb6d0c73170be43f540460bfc347b4'
outhash = '5a9cb1649625f0bf41fc7791b635cd9c2d7118c7f021ba87dcd03f72b67ce7a8'
unihash = 'f37918cc02eb5a520b1aff86faacbc0a38124646'
result = self.client.report_unihash(taskhash, self.METHOD, outhash, unihash)
self.assertEqual(result['unihash'], unihash, 'Server returned bad unihash')
# Report a different task with the same outhash. The returned unihash
# should match the first task
taskhash2 = '3bf6f1e89d26205aec90da04854fbdbf73afe6b4'
unihash2 = 'af36b199320e611fbb16f1f277d3ee1d619ca58b'
result = self.client.report_unihash(taskhash2, self.METHOD, outhash, unihash2)
self.assertEqual(result['unihash'], unihash, 'Server returned bad unihash')
def test_duplicate_taskhash(self):
# Tests that duplicate reports of the same taskhash with different
# outhash & unihash always return the unihash from the first reported
# taskhash
taskhash = '8aa96fcffb5831b3c2c0cb75f0431e3f8b20554a'
outhash = 'afe240a439959ce86f5e322f8c208e1fedefea9e813f2140c81af866cc9edf7e'
unihash = '218e57509998197d570e2c98512d0105985dffc9'
self.client.report_unihash(taskhash, self.METHOD, outhash, unihash)
self.assertClientGetHash(self.client, taskhash, unihash)
outhash2 = '0904a7fe3dc712d9fd8a74a616ddca2a825a8ee97adf0bd3fc86082c7639914d'
unihash2 = 'ae9a7d252735f0dafcdb10e2e02561ca3a47314c'
self.client.report_unihash(taskhash, self.METHOD, outhash2, unihash2)
self.assertClientGetHash(self.client, taskhash, unihash)
outhash3 = '77623a549b5b1a31e3732dfa8fe61d7ce5d44b3370f253c5360e136b852967b4'
unihash3 = '9217a7d6398518e5dc002ed58f2cbbbc78696603'
self.client.report_unihash(taskhash, self.METHOD, outhash3, unihash3)
self.assertClientGetHash(self.client, taskhash, unihash)
def test_huge_message(self):
# Simple test that hashes can be created
taskhash = 'c665584ee6817aa99edfc77a44dd853828279370'
outhash = '3c979c3db45c569f51ab7626a4651074be3a9d11a84b1db076f5b14f7d39db44'
unihash = '90e9bc1d1f094c51824adca7f8ea79a048d68824'
self.assertClientGetHash(self.client, taskhash, None)
siginfo = "0" * (self.client.max_chunk * 4)
result = self.client.report_unihash(taskhash, self.METHOD, outhash, unihash, {
'outhash_siginfo': siginfo
})
self.assertEqual(result['unihash'], unihash, 'Server returned bad unihash')
result_unihash = self.client.get_taskhash(self.METHOD, taskhash, True)
self.assertEqual(result_unihash['taskhash'], taskhash)
self.assertEqual(result_unihash['unihash'], unihash)
self.assertEqual(result_unihash['method'], self.METHOD)
result_outhash = self.client.get_outhash(self.METHOD, outhash, taskhash)
self.assertEqual(result_outhash['taskhash'], taskhash)
self.assertEqual(result_outhash['method'], self.METHOD)
self.assertEqual(result_outhash['unihash'], unihash)
self.assertEqual(result_outhash['outhash'], outhash)
self.assertEqual(result_outhash['outhash_siginfo'], siginfo)
def test_stress(self):
def query_server(failures):
client = Client(self.server.address)
try:
for i in range(1000):
taskhash = hashlib.sha256()
taskhash.update(str(i).encode('utf-8'))
taskhash = taskhash.hexdigest()
result = client.get_unihash(self.METHOD, taskhash)
if result != taskhash:
failures.append("taskhash mismatch: %s != %s" % (result, taskhash))
finally:
client.close()
# Report hashes
for i in range(1000):
taskhash = hashlib.sha256()
taskhash.update(str(i).encode('utf-8'))
taskhash = taskhash.hexdigest()
self.client.report_unihash(taskhash, self.METHOD, taskhash, taskhash)
failures = []
threads = [threading.Thread(target=query_server, args=(failures,)) for t in range(100)]
for t in threads:
t.start()
for t in threads:
t.join()
self.assertFalse(failures)
def test_upstream_server(self):
# Tests upstream server support. This is done by creating two servers
# that share a database file. The downstream server has it upstream
# set to the test server, whereas the side server doesn't. This allows
# verification that the hash requests are being proxied to the upstream
# server by verifying that they appear on the downstream client, but not
# the side client. It also verifies that the results are pulled into
# the downstream database by checking that the downstream and side servers
# match after the downstream is done waiting for all backfill tasks
(down_client, down_server) = self.start_server(upstream=self.server.address)
(side_client, side_server) = self.start_server(dbpath=down_server.dbpath)
def check_hash(taskhash, unihash, old_sidehash):
nonlocal down_client
nonlocal side_client
# check upstream server
self.assertClientGetHash(self.client, taskhash, unihash)
# Hash should *not* be present on the side server
self.assertClientGetHash(side_client, taskhash, old_sidehash)
# Hash should be present on the downstream server, since it
# will defer to the upstream server. This will trigger
# the backfill in the downstream server
self.assertClientGetHash(down_client, taskhash, unihash)
# After waiting for the downstream client to finish backfilling the
# task from the upstream server, it should appear in the side server
# since the database is populated
down_client.backfill_wait()
self.assertClientGetHash(side_client, taskhash, unihash)
# Basic report
taskhash = '8aa96fcffb5831b3c2c0cb75f0431e3f8b20554a'
outhash = 'afe240a439959ce86f5e322f8c208e1fedefea9e813f2140c81af866cc9edf7e'
unihash = '218e57509998197d570e2c98512d0105985dffc9'
self.client.report_unihash(taskhash, self.METHOD, outhash, unihash)
check_hash(taskhash, unihash, None)
# Duplicated taskhash with multiple output hashes and unihashes.
# All servers should agree with the originally reported hash
outhash2 = '0904a7fe3dc712d9fd8a74a616ddca2a825a8ee97adf0bd3fc86082c7639914d'
unihash2 = 'ae9a7d252735f0dafcdb10e2e02561ca3a47314c'
self.client.report_unihash(taskhash, self.METHOD, outhash2, unihash2)
check_hash(taskhash, unihash, unihash)
# Report an equivalent task. The sideload will originally report
# no unihash until backfilled
taskhash3 = "044c2ec8aaf480685a00ff6ff49e6162e6ad34e1"
unihash3 = "def64766090d28f627e816454ed46894bb3aab36"
self.client.report_unihash(taskhash3, self.METHOD, outhash, unihash3)
check_hash(taskhash3, unihash, None)
# Test that reporting a unihash in the downstream client isn't
# propagating to the upstream server
taskhash4 = "e3da00593d6a7fb435c7e2114976c59c5fd6d561"
outhash4 = "1cf8713e645f491eb9c959d20b5cae1c47133a292626dda9b10709857cbe688a"
unihash4 = "3b5d3d83f07f259e9086fcb422c855286e18a57d"
down_client.report_unihash(taskhash4, self.METHOD, outhash4, unihash4)
down_client.backfill_wait()
self.assertClientGetHash(down_client, taskhash4, unihash4)
self.assertClientGetHash(side_client, taskhash4, unihash4)
self.assertClientGetHash(self.client, taskhash4, None)
# Test that reporting a unihash in the downstream is able to find a
# match which was previously reported to the upstream server
taskhash5 = '35788efcb8dfb0a02659d81cf2bfd695fb30faf9'
outhash5 = '2765d4a5884be49b28601445c2760c5f21e7e5c0ee2b7e3fce98fd7e5970796f'
unihash5 = 'f46d3fbb439bd9b921095da657a4de906510d2cd'
result = self.client.report_unihash(taskhash5, self.METHOD, outhash5, unihash5)
taskhash6 = '35788efcb8dfb0a02659d81cf2bfd695fb30fafa'
unihash6 = 'f46d3fbb439bd9b921095da657a4de906510d2ce'
result = down_client.report_unihash(taskhash6, self.METHOD, outhash5, unihash6)
self.assertEqual(result['unihash'], unihash5, 'Server failed to copy unihash from upstream')
# Tests read through from server with
taskhash7 = '9d81d76242cc7cfaf7bf74b94b9cd2e29324ed74'
outhash7 = '8470d56547eea6236d7c81a644ce74670ca0bbda998e13c629ef6bb3f0d60b69'
unihash7 = '05d2a63c81e32f0a36542ca677e8ad852365c538'
self.client.report_unihash(taskhash7, self.METHOD, outhash7, unihash7)
result = down_client.get_taskhash(self.METHOD, taskhash7, True)
self.assertEqual(result['unihash'], unihash7, 'Server failed to copy unihash from upstream')
self.assertEqual(result['outhash'], outhash7, 'Server failed to copy unihash from upstream')
self.assertEqual(result['taskhash'], taskhash7, 'Server failed to copy unihash from upstream')
self.assertEqual(result['method'], self.METHOD)
taskhash8 = '86978a4c8c71b9b487330b0152aade10c1ee58aa'
outhash8 = 'ca8c128e9d9e4a28ef24d0508aa20b5cf880604eacd8f65c0e366f7e0cc5fbcf'
unihash8 = 'd8bcf25369d40590ad7d08c84d538982f2023e01'
self.client.report_unihash(taskhash8, self.METHOD, outhash8, unihash8)
result = down_client.get_outhash(self.METHOD, outhash8, taskhash8)
self.assertEqual(result['unihash'], unihash8, 'Server failed to copy unihash from upstream')
self.assertEqual(result['outhash'], outhash8, 'Server failed to copy unihash from upstream')
self.assertEqual(result['taskhash'], taskhash8, 'Server failed to copy unihash from upstream')
self.assertEqual(result['method'], self.METHOD)
taskhash9 = 'ae6339531895ddf5b67e663e6a374ad8ec71d81c'
outhash9 = 'afc78172c81880ae10a1fec994b5b4ee33d196a001a1b66212a15ebe573e00b5'
unihash9 = '6662e699d6e3d894b24408ff9a4031ef9b038ee8'
self.client.report_unihash(taskhash9, self.METHOD, outhash9, unihash9)
result = down_client.get_taskhash(self.METHOD, taskhash9, False)
self.assertEqual(result['unihash'], unihash9, 'Server failed to copy unihash from upstream')
self.assertEqual(result['taskhash'], taskhash9, 'Server failed to copy unihash from upstream')
self.assertEqual(result['method'], self.METHOD)
def test_ro_server(self):
(ro_client, ro_server) = self.start_server(dbpath=self.server.dbpath, read_only=True)
# Report a hash via the read-write server
taskhash = '35788efcb8dfb0a02659d81cf2bfd695fb30faf9'
outhash = '2765d4a5884be49b28601445c2760c5f21e7e5c0ee2b7e3fce98fd7e5970796f'
unihash = 'f46d3fbb439bd9b921095da657a4de906510d2cd'
result = self.client.report_unihash(taskhash, self.METHOD, outhash, unihash)
self.assertEqual(result['unihash'], unihash, 'Server returned bad unihash')
# Check the hash via the read-only server
self.assertClientGetHash(ro_client, taskhash, unihash)
# Ensure that reporting via the read-only server fails
taskhash2 = 'c665584ee6817aa99edfc77a44dd853828279370'
outhash2 = '3c979c3db45c569f51ab7626a4651074be3a9d11a84b1db076f5b14f7d39db44'
unihash2 = '90e9bc1d1f094c51824adca7f8ea79a048d68824'
with self.assertRaises(ConnectionError):
ro_client.report_unihash(taskhash2, self.METHOD, outhash2, unihash2)
# Ensure that the database was not modified
self.assertClientGetHash(self.client, taskhash2, None)
def test_slow_server_start(self):
# Ensures that the server will exit correctly even if it gets a SIGTERM
# before entering the main loop
event = multiprocessing.Event()
def prefunc(server, idx):
nonlocal event
server_prefunc(server, idx)
event.wait()
def do_nothing(signum, frame):
pass
old_signal = signal.signal(signal.SIGTERM, do_nothing)
self.addCleanup(signal.signal, signal.SIGTERM, old_signal)
_, server = self.start_server(prefunc=prefunc)
server.process.terminate()
time.sleep(30)
event.set()
server.process.join(300)
self.assertIsNotNone(server.process.exitcode, "Server did not exit in a timely manner!")
def test_diverging_report_race(self):
# Tests that a reported task will correctly pick up an updated unihash
# This is a baseline report added to the database to ensure that there
# is something to match against as equivalent
outhash1 = 'afd11c366050bcd75ad763e898e4430e2a60659b26f83fbb22201a60672019fa'
taskhash1 = '3bde230c743fc45ab61a065d7a1815fbfa01c4740e4c895af2eb8dc0f684a4ab'
unihash1 = '3bde230c743fc45ab61a065d7a1815fbfa01c4740e4c895af2eb8dc0f684a4ab'
result = self.client.report_unihash(taskhash1, self.METHOD, outhash1, unihash1)
# Add a report that is equivalent to Task 1. It should ignore the
# provided unihash and report the unihash from task 1
taskhash2 = '6259ae8263bd94d454c086f501c37e64c4e83cae806902ca95b4ab513546b273'
unihash2 = taskhash2
result = self.client.report_unihash(taskhash2, self.METHOD, outhash1, unihash2)
self.assertEqual(result['unihash'], unihash1)
# Add another report for Task 2, but with a different outhash (e.g. the
# task is non-deterministic). It should still be marked with the Task 1
# unihash because it has the Task 2 taskhash, which is equivalent to
# Task 1
outhash3 = 'd2187ee3a8966db10b34fe0e863482288d9a6185cb8ef58a6c1c6ace87a2f24c'
result = self.client.report_unihash(taskhash2, self.METHOD, outhash3, unihash2)
self.assertEqual(result['unihash'], unihash1)
def test_diverging_report_reverse_race(self):
# Same idea as the previous test, but Tasks 2 and 3 are reported in
# reverse order the opposite order
outhash1 = 'afd11c366050bcd75ad763e898e4430e2a60659b26f83fbb22201a60672019fa'
taskhash1 = '3bde230c743fc45ab61a065d7a1815fbfa01c4740e4c895af2eb8dc0f684a4ab'
unihash1 = '3bde230c743fc45ab61a065d7a1815fbfa01c4740e4c895af2eb8dc0f684a4ab'
result = self.client.report_unihash(taskhash1, self.METHOD, outhash1, unihash1)
taskhash2 = '6259ae8263bd94d454c086f501c37e64c4e83cae806902ca95b4ab513546b273'
unihash2 = taskhash2
# Report Task 3 first. Since there is nothing else in the database it
# will use the client provided unihash
outhash3 = 'd2187ee3a8966db10b34fe0e863482288d9a6185cb8ef58a6c1c6ace87a2f24c'
result = self.client.report_unihash(taskhash2, self.METHOD, outhash3, unihash2)
self.assertEqual(result['unihash'], unihash2)
# Report Task 2. This is equivalent to Task 1 but there is already a mapping for
# taskhash2 so it will report unihash2
result = self.client.report_unihash(taskhash2, self.METHOD, outhash1, unihash2)
self.assertEqual(result['unihash'], unihash2)
# The originally reported unihash for Task 3 should be unchanged even if it
# shares a taskhash with Task 2
self.assertClientGetHash(self.client, taskhash2, unihash2)
class TestHashEquivalenceUnixServer(HashEquivalenceTestSetup, HashEquivalenceCommonTests, unittest.TestCase):
def get_server_addr(self, server_idx):
return "unix://" + os.path.join(self.temp_dir.name, 'sock%d' % server_idx)
class TestHashEquivalenceUnixServerLongPath(HashEquivalenceTestSetup, unittest.TestCase):
DEEP_DIRECTORY = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa/bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb/ccccccccccccccccccccccccccccccccccccccccccc"
def get_server_addr(self, server_idx):
os.makedirs(os.path.join(self.temp_dir.name, self.DEEP_DIRECTORY), exist_ok=True)
return "unix://" + os.path.join(self.temp_dir.name, self.DEEP_DIRECTORY, 'sock%d' % server_idx)
def test_long_sock_path(self):
# Simple test that hashes can be created
taskhash = '35788efcb8dfb0a02659d81cf2bfd695fb30faf9'
outhash = '2765d4a5884be49b28601445c2760c5f21e7e5c0ee2b7e3fce98fd7e5970796f'
unihash = 'f46d3fbb439bd9b921095da657a4de906510d2cd'
self.assertClientGetHash(self.client, taskhash, None)
result = self.client.report_unihash(taskhash, self.METHOD, outhash, unihash)
self.assertEqual(result['unihash'], unihash, 'Server returned bad unihash')
class TestHashEquivalenceTCPServer(HashEquivalenceTestSetup, HashEquivalenceCommonTests, unittest.TestCase):
def get_server_addr(self, server_idx):
# Some hosts cause asyncio module to misbehave, when IPv6 is not enabled.
# If IPv6 is enabled, it should be safe to use localhost directly, in general
# case it is more reliable to resolve the IP address explicitly.
return socket.gethostbyname("localhost") + ":0"