Added the ability to separate out reads and writes into their own connection pools. #344

Merged
15 commits merged on Oct 30, 2023
Changes from 5 commits
9 changes: 7 additions & 2 deletions run_goose.py
@@ -49,6 +49,11 @@ def main():
parser = argparse.ArgumentParser(description="Run goose migrations")
parser.add_argument("--only-if-empty-db", default=False, action="store_true")
parser.add_argument("--wait", type=int, default=30, help="Wait for connection for X seconds")
parser.add_argument("--goose-path", type=str, default="/go/bin/goose", help="Path for the goose binary")
parser.add_argument("--migration-files-path",
type=str,
default="/root/services/migration_service/migration_files/",
help="Path to migration files")
args = parser.parse_args()

db_connection_string = "postgresql://{}:{}@{}:{}/{}?sslmode=disable".format(
@@ -72,9 +77,9 @@ def main():

p = Popen(
[
"goose",
args.goose_path,
"-dir",
"/root/services/migration_service/migration_files/",
args.migration_files_path,
"postgres",
db_connection_string,
"up",
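For context, a hedged sketch of how the migration script might be invoked with the new flags; the binary and directory paths below are illustrative assumptions, not values from this PR.

```python
import subprocess

# Invoke run_goose.py with the new optional flags. --goose-path defaults to
# /go/bin/goose and --migration-files-path to the previously hard-coded
# directory, so existing callers keep working unchanged.
subprocess.run(
    [
        "python", "run_goose.py",
        "--wait", "30",
        "--goose-path", "/usr/local/bin/goose",              # assumed path
        "--migration-files-path", "/opt/migration_files/",   # assumed path
    ],
    check=True,
)
```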
72 changes: 51 additions & 21 deletions services/data/postgres_async_db.py
@@ -7,13 +7,13 @@
import math
import re
import time
from services.utils import logging
from services.utils import logging, DBType
from typing import List, Tuple

from .db_utils import DBResponse, DBPagination, aiopg_exception_handling, \
get_db_ts_epoch_str, translate_run_key, translate_task_key, new_heartbeat_ts
from .models import FlowRow, RunRow, StepRow, TaskRow, MetadataRow, ArtifactRow
from services.utils import DBConfiguration
from services.utils import DBConfiguration, USE_SEPARATE_READER_POOL

from services.data.service_configs import max_connection_retires, \
connection_retry_wait_time_seconds
@@ -49,6 +49,7 @@ class _AsyncPostgresDB(object):
metadata_table_postgres = None

pool = None
reader_pool = None
db_conf: DBConfiguration = None

def __init__(self, name='global'):
@@ -77,21 +78,42 @@ async def _init(self, db_conf: DBConfiguration, create_triggers=DB_TRIGGER_CREAT
for i in range(retries):
try:
self.pool = await aiopg.create_pool(
db_conf.dsn,
db_conf.get_dsn(),
minsize=db_conf.pool_min,
maxsize=db_conf.pool_max,
timeout=db_conf.timeout,
pool_recycle=10 * db_conf.timeout,
echo=AIOPG_ECHO)

self.reader_pool = await aiopg.create_pool(
db_conf.get_dsn(type=DBType.READER),
minsize=db_conf.pool_min,
maxsize=db_conf.pool_max,
timeout=db_conf.timeout,
pool_recycle=10 * db_conf.timeout,
echo=AIOPG_ECHO) if USE_SEPARATE_READER_POOL == "1" else self.pool

for table in self.tables:
await table._init(create_triggers=create_triggers)

self.logger.info(
"Connection established.\n"
" Pool min: {pool_min} max: {pool_max}\n".format(
pool_min=self.pool.minsize,
pool_max=self.pool.maxsize))
if USE_SEPARATE_READER_POOL == "1":
self.logger.info(
"Writer Connection established.\n"
" Pool min: {pool_min} max: {pool_max}\n".format(
pool_min=self.pool.minsize,
pool_max=self.pool.maxsize))

self.logger.info(
"Reader Connection established.\n"
" Pool min: {pool_min} max: {pool_max}\n".format(
pool_min=self.reader_pool.minsize,
pool_max=self.reader_pool.maxsize))
else:
self.logger.info(
"Connection established.\n"
" Pool min: {pool_min} max: {pool_max}\n".format(
pool_min=self.pool.minsize,
pool_max=self.pool.maxsize))

break # Break the retry loop
except Exception as e:
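This hunk calls `DBConfiguration.get_dsn()` and references `DBType`, neither of which appears in this diff. A minimal sketch of what that interface might look like, assuming the reader DSN differs from the writer DSN only by host; field names here are assumptions, not the actual implementation.

```python
from enum import Enum


class DBType(Enum):
    WRITER = "writer"
    READER = "reader"


class DBConfiguration:
    # Only the fields needed for this sketch; the real class carries more settings.
    def __init__(self, host, read_replica_host, port, user, password, database):
        self.host = host
        self.read_replica_host = read_replica_host
        self.port = port
        self.user = user
        self.password = password
        self.database = database

    def get_dsn(self, type: DBType = DBType.WRITER) -> str:
        # Assumed behaviour: the reader DSN only swaps the host for the replica endpoint.
        host = self.read_replica_host if type == DBType.READER else self.host
        return "dbname={} user={} password={} host={} port={}".format(
            self.database, self.user, self.password, host, self.port
        )
```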
@@ -211,15 +233,20 @@ async def find_records(self, conditions: List[str] = None, values=[], fetch_sing

async def execute_sql(self, select_sql: str, values=[], fetch_single=False,
expanded=False, limit: int = 0, offset: int = 0,
cur: aiopg.Cursor = None) -> Tuple[DBResponse, DBPagination]:
cur: aiopg.Cursor = None, serialize: bool = True) -> Tuple[DBResponse, DBPagination]:
async def _execute_on_cursor(_cur):
await _cur.execute(select_sql, values)

rows = []
records = await _cur.fetchall()
for record in records:
row = self._row_type(**record) # pylint: disable=not-callable
rows.append(row.serialize(expanded))
if serialize:
for record in records:
# pylint-initial-ignore: Lack of __init__ makes this too hard for pylint
# pylint: disable=not-callable
row = self._row_type(**record)
rows.append(row.serialize(expanded))
else:
rows = records

count = len(rows)

@@ -231,18 +258,21 @@ async def _execute_on_cursor(_cur):
count=count,
page=math.floor(int(offset) / max(int(limit), 1)) + 1,
)
_cur.close()
return body, pagination
if cur:
# if we are using the passed in cursor, we allow any errors to be managed by cursor owner
body, pagination = await _execute_on_cursor(cur)
return DBResponse(response_code=200, body=body), pagination

try:
with (await self.db.pool.cursor(
cursor_factory=psycopg2.extras.DictCursor
)) as cur:
if cur:
# if we are using the passed in cursor, we allow any errors to be managed by cursor owner
body, pagination = await _execute_on_cursor(cur)
cur.close() # unsure if needed, leaving in there for safety
return DBResponse(response_code=200, body=body), pagination
return DBResponse(response_code=200, body=body), pagination
else:
db_pool = self.db.reader_pool if USE_SEPARATE_READER_POOL == "1" else self.db.pool
with (await db_pool.cursor(
cursor_factory=psycopg2.extras.DictCursor
)) as cur:
body, pagination = await _execute_on_cursor(cur)
return DBResponse(response_code=200, body=body), pagination
except IndexError as error:
return aiopg_exception_handling(error), None
except (Exception, psycopg2.DatabaseError) as error:
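A hedged usage sketch of the updated `execute_sql` signature: with no cursor the query is routed to the reader pool when `USE_SEPARATE_READER_POOL` is `"1"`, while a passed-in cursor keeps pool choice and error handling with the caller. The `flow_table` name, table name, and query below are illustrative assumptions.

```python
async def list_flows(flow_table):
    # No cursor supplied, so the query may be routed to the reader pool.
    response, pagination = await flow_table.execute_sql(
        select_sql="SELECT * FROM flows_v3 LIMIT %s OFFSET %s",
        values=[10, 0],
        limit=10,
        offset=0,
    )
    if response.response_code == 200:
        return response.body  # serialized rows; serialize defaults to True
    return []
```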
9 changes: 7 additions & 2 deletions services/metadata_service/tests/integration_tests/utils.py
@@ -2,6 +2,8 @@
from typing import Callable

import pytest
import psycopg2
import psycopg2.extras
from aiohttp import web
from services.data.postgres_async_db import AsyncPostgresDB
from services.utils.tests import get_test_dbconf
@@ -67,8 +69,11 @@ async def clean_db(db: AsyncPostgresDB):
db.run_table_postgres,
db.flow_table_postgres
]
for table in tables:
await table.execute_sql(select_sql="DELETE FROM {}".format(table.table_name))
with (await db.pool.cursor(
cursor_factory=psycopg2.extras.DictCursor
)) as cur:
for table in tables:
await table.execute_sql(select_sql="DELETE FROM {}".format(table.table_name), cur=cur)


@pytest.fixture
2 changes: 1 addition & 1 deletion services/migration_service/api/admin.py
@@ -69,7 +69,7 @@ async def upgrade(self, request):
description: could not upgrade
"""
goose_version_cmd = make_goose_migration_template(
db_conf.connection_string_url,
db_conf.connection_string_url(),
"up"
)
p = Popen(goose_version_cmd, shell=True,
6 changes: 3 additions & 3 deletions services/migration_service/api/utils.py
@@ -25,7 +25,7 @@ def get_unapplied_migrations(current_version):
@staticmethod
async def get_goose_version():
# if tables exist but goose doesn't find version table then
goose_version_cmd = make_goose_template(db_conf.connection_string_url, 'version')
goose_version_cmd = make_goose_template(db_conf.connection_string_url(), 'version')

p = Popen(goose_version_cmd, stdout=PIPE, stderr=PIPE, shell=True,
close_fds=True)
@@ -55,7 +55,7 @@ async def get_latest_compatible_version():
return version_dict[version]
else:
print("Running initial migration..", file=sys.stderr)
goose_version_cmd = make_goose_migration_template(db_conf.connection_string_url, 'up')
goose_version_cmd = make_goose_migration_template(db_conf.connection_string_url(), 'up')
p = Popen(goose_version_cmd, shell=True,
close_fds=True)
if p.wait() != 0:
@@ -65,7 +65,7 @@
@staticmethod
async def is_migration_in_progress():
goose_version_cmd = make_goose_template(
db_conf.connection_string_url, "status"
db_conf.connection_string_url(), "status"
)

p = Popen(goose_version_cmd, stdout=PIPE, stderr=PIPE, shell=True,
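These call sites change `db_conf.connection_string_url` from an attribute access to a method call; the method body is not part of this diff. A hedged sketch of what it might look like as a standalone function, reusing the URL shape built in run_goose.py above and assuming the same fields as the `DBConfiguration` sketch earlier.

```python
def connection_string_url(conf) -> str:
    # Hypothetical helper; `conf` is assumed to carry user/password/host/port/database
    # fields, and the real method on DBConfiguration may differ.
    return "postgresql://{}:{}@{}:{}/{}?sslmode=disable".format(
        conf.user, conf.password, conf.host, conf.port, conf.database
    )
```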
2 changes: 1 addition & 1 deletion services/migration_service/data/postgres_async_db.py
@@ -40,7 +40,7 @@ async def _init(self, db_conf: DBConfiguration):
retries = 3
for i in range(retries):
try:
self.pool = await aiopg.create_pool(db_conf.dsn, timeout=db_conf.timeout)
self.pool = await aiopg.create_pool(db_conf.get_dsn(), timeout=db_conf.timeout)
except Exception as e:
print("printing connection exception: " + str(e))
if retries - i < 1:
4 changes: 2 additions & 2 deletions services/ui_backend_service/README.md
@@ -24,7 +24,7 @@ The UI service module is `services.ui_backend_service.ui_server`:
> $ /opt/latest/bin/python3 -m services.ui_backend_service.ui_server
> ```

Below is an Docker run command for running UI Service exposed at port 8083:
Below is a Docker run command for running UI Service exposed at port 8083:

> ```sh
> $ docker run \
@@ -60,7 +60,7 @@ The service depends on the following Environment Variables to be set:
- `MF_METADATA_DB_PSWD` [defaults to postgres]
- `MF_METADATA_DB_NAME` [defaults to postgres]

Optionally you can also overrider the host and port the service runs on:
Optionally you can also override the host and port the service runs on:

- `MF_UI_METADATA_PORT` [defaults to 8083]
- `MF_UI_METADATA_HOST` [defaults to 0.0.0.0]
37 changes: 20 additions & 17 deletions services/ui_backend_service/data/cache/client/cache_async_client.py
@@ -8,8 +8,8 @@

from services.utils import logging

OP_WORKER_CREATE = "worker_create"
OP_WORKER_TERMINATE = "worker_terminate"
OP_WORKER_CREATE = 'worker_create'
OP_WORKER_TERMINATE = 'worker_terminate'

WAIT_FREQUENCY = 0.2
HEARTBEAT_FREQUENCY = 1
@@ -20,16 +20,20 @@ class CacheAsyncClient(CacheClient):
_restart_requested = False

async def start_server(self, cmdline, env):
self.logger = logging.getLogger(
"CacheAsyncClient:{root}".format(root=self._root)
self.logger = logging.getLogger("CacheAsyncClient:{root}".format(root=self._root))

self._proc = await asyncio.create_subprocess_exec(*cmdline,
env=env,
stdin=PIPE,
stdout=PIPE,
stderr=STDOUT,
limit=1024000) # 1024KB

asyncio.gather(
self._heartbeat(),
self.read_stdout()
)

self._proc = await asyncio.create_subprocess_exec(
*cmdline, env=env, stdin=PIPE, stdout=PIPE, stderr=STDOUT, limit=1024000
) # 1024KB

asyncio.gather(self._heartbeat(), self.read_stdout())

async def _read_pipe(self, src):
while self._is_alive:
line = await src.readline()
@@ -49,15 +53,14 @@ async def read_message(self, line: str):
message = json.loads(line)
if self.logger.isEnabledFor(logging.INFO):
self.logger.info(message)
if message["op"] == OP_WORKER_CREATE:
self.pending_requests.add(message["stream_key"])
elif message["op"] == OP_WORKER_TERMINATE:
self.pending_requests.remove(message["stream_key"])
if message['op'] == OP_WORKER_CREATE:
self.pending_requests.add(message['stream_key'])
elif message['op'] == OP_WORKER_TERMINATE:
self.pending_requests.remove(message['stream_key'])

if self.logger.isEnabledFor(logging.INFO):
self.logger.info(
"Pending stream keys: {}".format(list(self.pending_requests))
)
self.logger.info("Pending stream keys: {}".format(
list(self.pending_requests)))
except JSONDecodeError as ex:
if self.logger.isEnabledFor(logging.INFO):
self.logger.info("Message: {}".format(line))
1 change: 1 addition & 0 deletions services/ui_backend_service/data/db/postgres_async_db.py
@@ -35,6 +35,7 @@ class AsyncPostgresDB(BaseAsyncPostgresDB):
metadata_table_postgres = None

pool = None
reader_pool = None
db_conf: DBConfiguration = None

def __init__(self, name='global'):
51 changes: 1 addition & 50 deletions services/ui_backend_service/data/db/tables/base.py
@@ -269,56 +269,7 @@ async def benchmark_sql(
self.db.logger.exception("Query Benchmarking failed")
return None

async def execute_sql(
self,
select_sql: str,
values=[],
fetch_single=False,
expanded=False,
limit: int = 0,
offset: int = 0,
serialize: bool = True,
) -> Tuple[DBResponse, DBPagination]:
try:
with (
await self.db.pool.cursor(cursor_factory=psycopg2.extras.DictCursor)
) as cur:
await cur.execute(select_sql, values)

rows = []
records = await cur.fetchall()
if serialize:
for record in records:
# pylint-initial-ignore: Lack of __init__ makes this too hard for pylint
# pylint: disable=not-callable
row = self._row_type(**record)
rows.append(row.serialize(expanded))
else:
rows = records

count = len(rows)

# Will raise IndexError in case fetch_single=True and there's no results
body = rows[0] if fetch_single else rows

pagination = DBPagination(
limit=limit,
offset=offset,
count=count,
page=math.floor(int(offset) / max(int(limit), 1)) + 1,
)

cur.close()
return DBResponse(response_code=200, body=body), pagination
except IndexError as error:
return aiopg_exception_handling(error), None
except (Exception, psycopg2.DatabaseError) as error:
self.db.logger.exception("Exception occured")
return aiopg_exception_handling(error), None

async def get_tags(
self, conditions: List[str] = None, values=[], limit: int = 0, offset: int = 0
):
async def get_tags(self, conditions: List[str] = None, values=[], limit: int = 0, offset: int = 0):
sql_template = """
SELECT DISTINCT tag
FROM (
25 changes: 25 additions & 0 deletions services/ui_backend_service/docs/environment.md
@@ -186,3 +186,28 @@ The `MF_LOG_LOAD_POLICY` environment variable restricts the amount of log content
## Card content restriction

The `MF_CARD_LOAD_POLICY` (default `full`) environment variable can be set to `blurb_only` to return a Python code snippet to access card using Metaflow client, instead of loading actual HTML card payload.


## Scaling reads using read replicas

Databases such as [Amazon Aurora](https://aws.amazon.com/rds/aurora/) provide
[read replicas](https://aws.amazon.com/rds/features/read-replicas/) that make it easy to elastically scale heavy read
workloads beyond the capacity constraints of a single database instance. You can separate the reads and writes of this
application by setting the following two environment variables:

>```
> USE_SEPARATE_READER_POOL = 1
> MF_METADATA_DB_READ_REPLICA_HOST = <READ_REPLICA_ENDPOINT>
>```

Review comment on this section:

I am looking for something exactly like this PR. This variable seems a bit redundant. It could simply be:

USE_SEPARATE_READER_POOL = bool(MF_METADATA_DB_READ_REPLICA_HOST)

thus disabling the feature if no host is given and implicitly turning it on if one is given. This would also prevent the case where this is set to true and no host is given.

As the name suggests, `USE_SEPARATE_READER_POOL` creates a separate reader pool with the same min/max pool size as the
writer pool. You must also set `MF_METADATA_DB_READ_REPLICA_HOST` to point to the read replica endpoint, which is
typically a load balancer in front of all of the database's read replicas.
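For illustration, a hedged sketch of how the service might interpret these variables; only the comparison of `USE_SEPARATE_READER_POOL` against the string `"1"` is visible in this PR, and the fallback behaviour below is an assumption.

```python
import os

# "1" enables the separate reader pool; any other value keeps a single pool.
USE_SEPARATE_READER_POOL = os.environ.get("USE_SEPARATE_READER_POOL", "0")

# Assumed fallback: if no replica endpoint is given, reuse the primary host so
# the reader DSN still points at a working database.
MF_METADATA_DB_READ_REPLICA_HOST = os.environ.get(
    "MF_METADATA_DB_READ_REPLICA_HOST",
    os.environ.get("MF_METADATA_DB_HOST", "localhost"),
)
```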

### Accounting for eventual consistency

When a read replica is created, there is a lag between the time a transaction is committed to the writer instance and
the time when the newly written data is available in the read replica. In Amazon Aurora, this [lag is usually much less
than 100ms](https://docs.aws.amazon.com/AmazonRDS/latest/AuroraUserGuide/Aurora.Replication.html) because the replicas
share the same underlying storage layer as the writer instance, thereby avoiding the need to copy data into the replica
nodes. This Metaflow UI service is read-heavy and hence a great candidate for scaling reads with this model.
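Replica lag matters most for read-after-write paths. A hedged sketch of one way a caller could avoid it under this design: pass a cursor taken from the writer pool so `execute_sql` does not route the read to the reader pool. The table, column, and helper name below are illustrative assumptions.

```python
import psycopg2.extras


async def read_own_write(db, flow_id):
    # A cursor from db.pool (the writer pool) keeps the read on the primary,
    # so it observes writes committed there without waiting on replica lag.
    with (await db.pool.cursor(cursor_factory=psycopg2.extras.DictCursor)) as cur:
        response, _ = await db.flow_table_postgres.execute_sql(
            select_sql="SELECT * FROM flows_v3 WHERE flow_id = %s",
            values=[flow_id],
            fetch_single=True,
            cur=cur,
        )
    return response.body
```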