import asyncio
import contextlib

from concurrent.futures import ALL_COMPLETED
from async_timeout import timeout as async_timeout

from ..log import sentinel_logger
from ..pubsub import Receiver
from ..pool import create_pool, ConnectionsPool
from ..errors import (
    MasterNotFoundError,
    SlaveNotFoundError,
    PoolClosedError,
    RedisError,
    MasterReplyError,
    SlaveReplyError,
    )
from ..util import CloseEvent


# Address marker for discovery
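# ManagedPool (below) uses this sentinel object as its address until a real
# (host, port) pair has been discovered through Sentinel.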
_NON_DISCOVERED = object()

_logger = sentinel_logger.getChild('monitor')


async def create_sentinel_pool(sentinels, *, db=None, password=None,
                               encoding=None, minsize=1, maxsize=10,
                               ssl=None, parser=None, timeout=0.2, loop=None):
    """Create SentinelPool."""
    # FIXME: revise default timeout value
    assert isinstance(sentinels, (list, tuple)), sentinels
    # TODO: deprecation note
    # if loop is None:
    #     loop = asyncio.get_event_loop()

    pool = SentinelPool(sentinels, db=db,
                        password=password,
                        ssl=ssl,
                        encoding=encoding,
                        parser=parser,
                        minsize=minsize,
                        maxsize=maxsize,
                        timeout=timeout,
                        loop=loop)
    await pool.discover()
    return pool
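
# Example usage (illustrative sketch; the addresses and the 'mymaster'
# service name are hypothetical):
#
#   async def main():
#       pool = await create_sentinel_pool(
#           [('localhost', 26379), ('localhost', 26380)])
#       master = pool.master_for('mymaster')  # ManagedPool for the master
#       await master.execute('set', 'key', 'value')
#       pool.close()
#       await pool.wait_closed()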


class SentinelPool:
    """Sentinel connections pool.

    Holds connection pools to known and discovered (TBD) Sentinels
    as well as services' connections.
    """

    def __init__(self, sentinels, *, db=None, password=None, ssl=None,
                 encoding=None, parser=None, minsize, maxsize, timeout,
                 loop=None):
        # TODO: deprecation note
        # if loop is None:
        #     loop = asyncio.get_event_loop()
        # TODO: add connection/discover timeouts;
        #   and what to do if no master is found:
        #   (raise error or try forever or try until timeout)

        # XXX: _sentinels is unordered
        self._sentinels = set(sentinels)
        self._timeout = timeout
        self._pools = []    # list of sentinel pools
        self._masters = {}
        self._slaves = {}
        self._parser_class = parser
        self._redis_db = db
        self._redis_password = password
        self._redis_ssl = ssl
        self._redis_encoding = encoding
        self._redis_minsize = minsize
        self._redis_maxsize = maxsize
        self._close_state = CloseEvent(self._do_close)
        self._close_waiter = None
        self._monitor = monitor = Receiver()

        async def echo_events():
            try:
                while await monitor.wait_message():
                    _, (ev, data) = await monitor.get(encoding='utf-8')
                    ev = ev.decode('utf-8')
                    _logger.debug("%s: %s", ev, data)
                    if ev in ('+odown',):
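                        # '+odown' is Sentinel's "objectively down" event:
                        # enough sentinels agreed the instance is down. The
                        # payload starts with the instance type and name.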
                        typ, name, *tail = data.split(' ')
                        if typ == 'master':
                            self._need_rediscover(name)
                    # TODO: parse messages;
                    #   watch +new-epoch which signals `failover in progress`
                    #   freeze reconnection
                    #   wait / discover new master (find proper way)
                    #   unfreeze reconnection
                    #
                    #   discover master in default way
                    #       get-master-addr...
                    #       connect
                    #       role
                    #       etc...
            except asyncio.CancelledError:
                pass
        self._monitor_task = asyncio.ensure_future(echo_events())

    @property
    def discover_timeout(self):
        """Timeout (seconds) for Redis/Sentinel command calls during
        master/slave address discovery.
        """
        return self._timeout

    def master_for(self, service):
        """Returns wrapper to master's pool for requested service."""
        # TODO: make it coroutine and connect minsize connections
        if service not in self._masters:
            self._masters[service] = ManagedPool(
                self, service, is_master=True,
                db=self._redis_db,
                password=self._redis_password,
                encoding=self._redis_encoding,
                minsize=self._redis_minsize,
                maxsize=self._redis_maxsize,
                ssl=self._redis_ssl,
                parser=self._parser_class,
                )
        return self._masters[service]

    def slave_for(self, service):
        """Returns wrapper to slave's pool for requested service."""
        # TODO: make it coroutine and connect minsize connections
        if service not in self._slaves:
            self._slaves[service] = ManagedPool(
                self, service, is_master=False,
                db=self._redis_db,
                password=self._redis_password,
                encoding=self._redis_encoding,
                minsize=self._redis_minsize,
                maxsize=self._redis_maxsize,
                ssl=self._redis_ssl,
                parser=self._parser_class,
                )
        return self._slaves[service]

    def execute(self, command, *args, **kwargs):
        """Execute sentinel command."""
        # TODO: choose pool
        #   kwargs can be used to control which sentinel to use
        if self.closed:
            raise PoolClosedError("Sentinel pool is closed")
        for pool in self._pools:
            return pool.execute(command, *args, **kwargs)
        # how to handle errors and pick other pool?
        #   is the only way to make it coroutine?

    @property
    def closed(self):
        """True if pool is closed or closing."""
        return self._close_state.is_set()

    def close(self):
        """Close all controlled connections (both sentinel and redis)."""
        if not self._close_state.is_set():
            self._close_state.set()

    async def _do_close(self):
        # TODO: lock
        tasks = []
        task, self._monitor_task = self._monitor_task, None
        task.cancel()
        tasks.append(task)
        while self._pools:
            pool = self._pools.pop(0)
            pool.close()
            tasks.append(pool.wait_closed())
        while self._masters:
            _, pool = self._masters.popitem()
            pool.close()
            tasks.append(pool.wait_closed())
        while self._slaves:
            _, pool = self._slaves.popitem()
            pool.close()
            tasks.append(pool.wait_closed())
        await asyncio.gather(*tasks)

    async def wait_closed(self):
        """Wait until pool gets closed."""
        await self._close_state.wait()

    async def discover(self, timeout=None):    # TODO: better name?
        """Discover sentinels and all monitored services within given timeout.

        If no sentinels are discovered within the timeout, TimeoutError
        is raised.
        If some sentinels were discovered but not all, that is OK.
        If not all monitored services (masters/slaves) were discovered
        (or connections established), that is OK.
        TBD: what if some sentinels/services unreachable;
        """
        # TODO: check not closed
        # TODO: discovery must be done with some customizable timeout.
        if timeout is None:
            timeout = self.discover_timeout
        tasks = []
        pools = []
        for addr in self._sentinels:   # iterate over unordered set
            tasks.append(self._connect_sentinel(addr, timeout, pools))
        done, pending = await asyncio.wait(tasks,
                                           return_when=ALL_COMPLETED)
        assert not pending, ("Expected all tasks to complete", done, pending)

        for task in done:
            result = task.result()
            if isinstance(result, Exception):
                continue    # FIXME
        if not pools:
            raise Exception("Could not connect to any sentinel")
        pools, self._pools[:] = self._pools[:], pools
        # TODO: close current connections
        for pool in pools:
            pool.close()
            await pool.wait_closed()

        # TODO: discover peer sentinels
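        # Subscribing to pattern '*' routes every Sentinel pub/sub event into
        # self._monitor, which echo_events() (started in __init__) consumes.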
        for pool in self._pools:
            await pool.execute_pubsub(
                b'psubscribe', self._monitor.pattern('*'))

    async def _connect_sentinel(self, address, timeout, pools):
        """Try to connect to specified Sentinel returning either
        connections pool or exception.
        """
        try:
            with async_timeout(timeout):
                pool = await create_pool(
                    address, minsize=1, maxsize=2,
                    parser=self._parser_class,
                    )
            pools.append(pool)
            return pool
        except asyncio.TimeoutError as err:
            sentinel_logger.debug(
                "Failed to connect to Sentinel(%r) within %ss timeout",
                address, timeout)
            return err
        except Exception as err:
            sentinel_logger.debug(
                "Error connecting to Sentinel(%r): %r", address, err)
            return err

    async def discover_master(self, service, timeout):
        """Perform Master discovery for specified service."""
        # TODO: get lock
        idle_timeout = timeout
        # FIXME: single timeout used 4 times;
        #   meaning discovery can take up to:
        #   3 * timeout * (sentinels count)
        #
        #   having one global timeout also can lead to
        #   a problem when not all sentinels are checked.

        # use a copy, because pools can change
        pools = self._pools[:]
        for sentinel in pools:
            try:
                with async_timeout(timeout):
                    address = await self._get_masters_address(
                        sentinel, service)

                pool = self._masters[service]
                with async_timeout(timeout), \
                        contextlib.ExitStack() as stack:
                    conn = await pool._create_new_connection(address)
                    stack.callback(conn.close)
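                    # If role verification below fails, ExitStack closes the
                    # connection; pop_all() cancels that cleanup on success.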
                    await self._verify_service_role(conn, 'master')
                    stack.pop_all()

                return conn
            except asyncio.CancelledError:
                # we must correctly handle CancelledError(s):
                #   application may be stopped or function can be cancelled
                #   by outer timeout, so we must stop the lookup.
                raise
            except asyncio.TimeoutError:
                continue
            except DiscoverError as err:
                sentinel_logger.debug("DiscoverError(%r, %s): %r",
                                      sentinel, service, err)
                await asyncio.sleep(idle_timeout)
                continue
            except RedisError as err:
                raise MasterReplyError("Service {} error".format(service), err)
            except Exception:
                # TODO: clear (drop) connections to schedule reconnect
                await asyncio.sleep(idle_timeout)
                continue
        # Otherwise
        raise MasterNotFoundError("No master found for {}".format(service))

    async def discover_slave(self, service, timeout, **kwargs):
        """Perform Slave discovery for specified service."""
        # TODO: use kwargs to change how slaves are picked up
        #   (eg: round-robin, priority, random, etc)
        idle_timeout = timeout
        pools = self._pools[:]
        for sentinel in pools:
            try:
                with async_timeout(timeout):
                    address = await self._get_slave_address(
                        sentinel, service)  # add **kwargs
                pool = self._slaves[service]
                with async_timeout(timeout), \
                        contextlib.ExitStack() as stack:
                    conn = await pool._create_new_connection(address)
                    stack.callback(conn.close)
                    await self._verify_service_role(conn, 'slave')
                    stack.pop_all()
                return conn
            except asyncio.CancelledError:
                raise
            except asyncio.TimeoutError:
                continue
            except DiscoverError:
                await asyncio.sleep(idle_timeout)
                continue
            except RedisError as err:
                raise SlaveReplyError("Service {} error".format(service), err)
            except Exception:
                await asyncio.sleep(idle_timeout)
                continue
        raise SlaveNotFoundError("No slave found for {}".format(service))

    async def _get_masters_address(self, sentinel, service):
        # NOTE: we don't use `get-master-addr-by-name`
        #   as it can provide stale data so we repeat
        #   after redis-py and check service flags.
        state = await sentinel.execute(b'sentinel', b'master',
                                       service, encoding='utf-8')
        if not state:
            raise UnknownService()
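        # `state` is a flat field/value list, e.g. (values are illustrative):
        #   ['name', 'mymaster', 'ip', '127.0.0.1', 'port', '6379',
        #    'flags', 'master', ...]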
        state = make_dict(state)
        address = state['ip'], int(state['port'])
        flags = set(state['flags'].split(','))
        if {'s_down', 'o_down', 'disconnected'} & flags:
            raise BadState(state)
        return address

    async def _get_slave_address(self, sentinel, service):
        # Find and return single slave address
        slaves = await sentinel.execute(b'sentinel', b'slaves',
                                        service, encoding='utf-8')
        if not slaves:
            raise UnknownService()
        for state in map(make_dict, slaves):
            address = state['ip'], int(state['port'])
            flags = set(state['flags'].split(','))
            if {'s_down', 'o_down', 'disconnected'} & flags:
                continue
            return address
        raise BadState()    # XXX: only last state

    async def _verify_service_role(self, conn, role):
        res = await conn.execute(b'role', encoding='utf-8')
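        # ROLE replies with an array whose first element is the role name
        # ('master' or 'slave' for data nodes).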
        if res[0] != role:
            raise RoleMismatch(res)

    def _need_rediscover(self, service):
        sentinel_logger.debug("Must rediscover service %s", service)
        pool = self._masters.get(service)
        if pool:
            pool.need_rediscover()
        pool = self._slaves.get(service)
        if pool:
            pool.need_rediscover()


class ManagedPool(ConnectionsPool):

    def __init__(self, sentinel, service, is_master,
                 db=None, password=None, encoding=None, parser=None,
                 *, minsize, maxsize, ssl=None, loop=None):
        super().__init__(_NON_DISCOVERED,
                         db=db, password=password, encoding=encoding,
                         minsize=minsize, maxsize=maxsize, ssl=ssl,
                         parser=parser, loop=loop)
        assert self._address is _NON_DISCOVERED
        self._sentinel = sentinel
        self._service = service
        self._is_master = is_master
        # self._discover_timeout = .2

    @property
    def address(self):
        if self._address is _NON_DISCOVERED:
            return
        return self._address

    def get_connection(self, command, args=()):
        if self._address is _NON_DISCOVERED:
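            # No address is known yet: report that no pooled connection is
            # available, so a new one is created via _create_new_connection(),
            # which performs discovery.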
            return None, _NON_DISCOVERED
        return super().get_connection(command, args)

    async def _create_new_connection(self, address):
        if address is _NON_DISCOVERED:
            # Perform service discovery.
            # Returns Connection or raises error if no service can be found.
            await self._do_clear()  # make `clear` blocking

            if self._is_master:
                conn = await self._sentinel.discover_master(
                    self._service, timeout=self._sentinel.discover_timeout)
            else:
                conn = await self._sentinel.discover_slave(
                    self._service, timeout=self._sentinel.discover_timeout)
            self._address = conn.address
            sentinel_logger.debug("Discovered new address %r for %s",
                                  conn.address, self._service)
            return conn
        return await super()._create_new_connection(address)

    def _drop_closed(self):
        diff = len(self._pool)
        super()._drop_closed()
        diff -= len(self._pool)
        if diff:
            # closed connections were in pool:
            #   * reset address;
            #   * notify sentinel pool
            sentinel_logger.debug(
                "Dropped %d closed connection(s); must rediscover", diff)
            self._sentinel._need_rediscover(self._service)

    async def acquire(self, command=None, args=()):
        if self._address is _NON_DISCOVERED:
            await self.clear()
        return await super().acquire(command, args)

    def release(self, conn):
        was_closed = conn.closed
        super().release(conn)
        # if connection was closed while used and not by release()
        if was_closed:
            sentinel_logger.debug(
                "Released closed connection; must rediscover")
            self._sentinel._need_rediscover(self._service)

    def need_rediscover(self):
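        # Reset to the not-discovered marker so the next get_connection()
        # or acquire() call triggers discovery again.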
        self._address = _NON_DISCOVERED


def make_dict(plain_list):
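    # Pair up a flat [key1, val1, key2, val2, ...] reply into a dict, e.g.
    # ['ip', '127.0.0.1', 'port', '6379'] -> {'ip': '127.0.0.1', 'port': '6379'}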
    it = iter(plain_list)
    return dict(zip(it, it))


class DiscoverError(Exception):
    """Internal errors for masters/slaves discovery."""


class BadState(DiscoverError):
    """Bad master's / slave's state read from sentinel."""


class UnknownService(DiscoverError):
    """Service is not monitored by specific sentinel."""


class RoleMismatch(DiscoverError):
    """Service reported a different role than expected."""