From c357cd5b4c99513a726890f85ef80c5f355f278d Mon Sep 17 00:00:00 2001 From: Andrew Jackson Date: Mon, 18 Sep 2023 09:00:48 +0100 Subject: [PATCH] Adding a hook to ensure conversion records get indexed. --- lib/store/cmd.py | 2 +- lib/windex/mr_cdx_pywb_job.py | 9 ++++++++- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/lib/store/cmd.py b/lib/store/cmd.py index b6e4dd8..5e7b554 100755 --- a/lib/store/cmd.py +++ b/lib/store/cmd.py @@ -49,7 +49,7 @@ def main(): parser_list.add_argument('path', type=str, help='The path to list.') # 'put' subcommand - upload a file or folder to the store: - parser_up = subparsers.add_parser('put', help='Put a local file into the store.') + parser_up = subparsers.add_parser('put', help='Put a local file into the store. If the remote path is a directory, the local path will be resolved relative to the remote path. If the remote path is a directory and the local path is absolute, then the local path will be used as the absolute path on the remote service. 
Only one file can be uploaded at once.') parser_up.add_argument('-B', '--backup-and-replace', action='store_true', help='If the file already exists, move it aside using a dated backup file and replace it with the new file.') parser_up.add_argument('local_path', type=str, help='The local path to read.') parser_up.add_argument('path', type=str, help='The store path to write to.') diff --git a/lib/windex/mr_cdx_pywb_job.py b/lib/windex/mr_cdx_pywb_job.py index fff7061..510b85b 100644 --- a/lib/windex/mr_cdx_pywb_job.py +++ b/lib/windex/mr_cdx_pywb_job.py @@ -133,7 +133,13 @@ def mapper_raw(self, warc_path, warc_uri): # Using CDX11: # CDX N b a m s k r M S V g # com,example)/ 20170306040206 http://example.com/ text/html 200 G7HRM7BGOKSKMSXZAHMUQTTV53QOFSMK - - 1242 784 example.warc.gz - cdx11 = CDX11Indexer(inputs=[warc_path], output=cdx_file, cdx11=True, post_append=True) + cdx11 = CDX11Indexer( + inputs=[warc_path], + output=cdx_file, + cdx11=True, + post_append=True, # Append POST args to allow POST request indexing + records = ["response", "revisit", "resource", "metadata", "conversion"] # Make sure conversion records are indexed too + ) # The warc_path we get passed in is just the local temp filename. # From here, need to use the HDFS file URI instead and extract the path: @@ -183,6 +189,7 @@ def mapper_raw(self, warc_path, warc_uri): urlp = urlparse(url) host_key = f"{urlp.scheme}-{urlp.hostname}" extended_scheme_urls = extended_scheme_urls + 1 + # TODO Special handling of metadata:URL scheme that was used for video, replacing metadata with urn:embeds else: url_surt = parts[0] host_key = url_surt.split(")", 1)[0]