From 88ec96131f8fc8b3b22a6b57802654414a248399 Mon Sep 17 00:00:00 2001 From: Zeke Gabrielse Date: Fri, 8 Nov 2024 10:49:13 -0600 Subject: [PATCH] add enumeratorio to improve memory efficiency --- app/workers/process_docker_image_worker.rb | 29 +++++++++++---- config/initializers/enumerator_io.rb | 3 ++ config/initializers/zietwork.rb | 1 + lib/enumerator_io.rb | 37 +++++++++++++++++++ .../process_docker_image_worker_spec.rb | 19 ++++++++-- 5 files changed, 78 insertions(+), 11 deletions(-) create mode 100644 config/initializers/enumerator_io.rb create mode 100644 lib/enumerator_io.rb diff --git a/app/workers/process_docker_image_worker.rb b/app/workers/process_docker_image_worker.rb index 3c65c426dd..78f3f76362 100644 --- a/app/workers/process_docker_image_worker.rb +++ b/app/workers/process_docker_image_worker.rb @@ -20,10 +20,19 @@ def perform(artifact_id) raise ImageNotAcceptableError, 'unacceptable filesize' end - # download the image tarball + # download the image tarball in chunks to reduce memory footprint client = artifact.client - tar = client.get_object(bucket: artifact.bucket, key: artifact.key) - .body + enum = Enumerator.new do |yielder| + client.get_object(bucket: artifact.bucket, key: artifact.key) do |chunk| + yielder << chunk + end + end + + # wrap the enumerator to provide an IO-like interface + tar = EnumeratorIO.new(enum) + + # keep a ref to the manifest + manifest = nil # unpack the package tarball unpack tar do |archive| @@ -39,15 +48,15 @@ def perform(artifact_id) entry.size > MAX_MANIFEST_SIZE # parse/validate and minify the manifest - json = JSON.parse(entry.read) - .to_json + content = JSON.parse(entry.read) + .to_json - ReleaseManifest.create!( + manifest = ReleaseManifest.create!( account_id: artifact.account_id, environment_id: artifact.environment_id, release_id: artifact.release_id, release_artifact_id: artifact.id, - content: json, + content:, ) in %r{^blobs/sha256/} if entry.file? key = artifact.key_for(entry.name) @@ -67,6 +76,10 @@ def perform(artifact_id) end end + # we can assume image tarball is invalid if there's no manifest + raise ImageNotAcceptableError, 'manifest is missing' if + manifest.nil? + artifact.update!(status: 'UPLOADED') BroadcastEventService.call( @@ -76,6 +89,8 @@ def perform(artifact_id) ) rescue ImageNotAcceptableError, ActiveRecord::RecordInvalid, + JSON::ParserError, + Minitar::UnexpectedEOF, Minitar::Error, IOError => e Keygen.logger.warn { "[workers.process-docker-image-worker] Error: #{e.class.name} - #{e.message}" } diff --git a/config/initializers/enumerator_io.rb b/config/initializers/enumerator_io.rb new file mode 100644 index 0000000000..99d4fa7009 --- /dev/null +++ b/config/initializers/enumerator_io.rb @@ -0,0 +1,3 @@ +# frozen_string_literal: true + +require_dependency Rails.root / 'lib' / 'enumerator_io' diff --git a/config/initializers/zietwork.rb b/config/initializers/zietwork.rb index d1db06934a..739d5779ae 100644 --- a/config/initializers/zietwork.rb +++ b/config/initializers/zietwork.rb @@ -3,6 +3,7 @@ Rails.autoloaders.each do |autoloader| # FIXME(ezekg) Should we rename these to follow conventions? autoloader.inflector.inflect( + "enumerator_io" => "EnumeratorIO", "digest_io" => "DigestIO", "jsonapi" => "JSONAPI", "ee" => "EE", diff --git a/lib/enumerator_io.rb b/lib/enumerator_io.rb new file mode 100644 index 0000000000..8e12096540 --- /dev/null +++ b/lib/enumerator_io.rb @@ -0,0 +1,37 @@ +# frozen_string_literal: true + +class EnumeratorIO + def initialize(enum) + @enum = enum + @buffer = ''.b + @eof = false + end + + def eof? = @eof && @buffer.empty? + def read(length = nil, buffer = nil) + buffer ||= +'' + buffer.clear + + # fill the buffer until it has enough data or EOF + while !@eof && (length.nil? || @buffer.bytesize < length) + begin + @buffer << @enum.next + rescue StopIteration + @eof = true + + break + end + end + + # extract the requested amount of data from the buffer + if length + buffer << @buffer.slice!(0, length) + else + buffer << @buffer + + @buffer.clear + end + + buffer.empty? ? nil : buffer + end +end diff --git a/spec/workers/process_docker_image_worker_spec.rb b/spec/workers/process_docker_image_worker_spec.rb index d267061fb3..4fac1fea8e 100644 --- a/spec/workers/process_docker_image_worker_spec.rb +++ b/spec/workers/process_docker_image_worker_spec.rb @@ -15,9 +15,12 @@ after { Sidekiq::Testing.fake! } context 'when artifact is a valid image' do - let(:image_tarball) { file_fixture('alpine-3.20.3.tar').open } + let(:image_fixture) { 'alpine-3.20.3.tar' } + let(:image_tarball) { file_fixture(image_fixture).open } let(:manifest_json) { - Minitar::Reader.open image_tarball do |archive| + tarball = file_fixture(image_fixture).open + + Minitar::Reader.open tarball do |archive| archive.find { _1.file? && _1.name in 'manifest.json' } .read end @@ -291,10 +294,18 @@ context 'when artifact filesize is unaccurate' do let(:artifact) { create(:artifact, :processing, filesize: 1.kilobyte, account:) } - let(:file) { file_fixture('large.tar.gz').open } + let(:tgz) { file_fixture('large.tar.gz').open } + let(:tar) { Zlib::GzipReader.new(tgz).read } before do - Aws.config = { s3: { stub_responses: { get_object: [{ body: file }] } } } + Aws.config = { + s3: { + stub_responses: { + head_object: [{ content_length: tar.size }], + get_object: [{ body: tar }], + }, + }, + } end it 'should not process file' do