Skip to content

Commit

Permalink
Adding a hook to ensure conversion records get indexed.
Browse files Browse the repository at this point in the history
  • Loading branch information
anjackson committed Sep 18, 2023
1 parent e389424 commit c357cd5
Show file tree
Hide file tree
Showing 2 changed files with 9 additions and 2 deletions.
2 changes: 1 addition & 1 deletion lib/store/cmd.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ def main():
parser_list.add_argument('path', type=str, help='The path to list.')

# 'put' subcommand - upload a file or folder to the store:
parser_up = subparsers.add_parser('put', help='Put a local file into the store.')
parser_up = subparsers.add_parser('put', help='Put a local file into the store. If the remote path is a directory, the local path will be resolved relative to the remote path. If the remote path is a directory and the local path is absolute, then the local path will used as the absolute path on the remote service. Only one file can be uploaded at once.')
parser_up.add_argument('-B', '--backup-and-replace', action='store_true', help='If the file already exists, move it aside using a dated backup file and replace it with the new file.')
parser_up.add_argument('local_path', type=str, help='The local path to read.')
parser_up.add_argument('path', type=str, help='The store path to write to.')
Expand Down
9 changes: 8 additions & 1 deletion lib/windex/mr_cdx_pywb_job.py
Original file line number Diff line number Diff line change
Expand Up @@ -133,7 +133,13 @@ def mapper_raw(self, warc_path, warc_uri):
# Using CDX11:
# CDX N b a m s k r M S V g
# com,example)/ 20170306040206 http://example.com/ text/html 200 G7HRM7BGOKSKMSXZAHMUQTTV53QOFSMK - - 1242 784 example.warc.gz
cdx11 = CDX11Indexer(inputs=[warc_path], output=cdx_file, cdx11=True, post_append=True)
cdx11 = CDX11Indexer(
inputs=[warc_path],
output=cdx_file,
cdx11=True,
post_append=True, # Append POST args to allow POST request indexing
records = ["response", "revisit", "resource", "metadata", "conversion"] # Make sure conversion records are indexed too
)

# The warc_path we get passed in is just the local temp filename.
# From here, need to use the HDFS file URI instead and extract the path:
Expand Down Expand Up @@ -183,6 +189,7 @@ def mapper_raw(self, warc_path, warc_uri):
urlp = urlparse(url)
host_key = f"{urlp.scheme}-{urlp.hostname}"
extended_scheme_urls = extended_scheme_urls + 1
# TODO: Add special handling for the metadata: URL scheme that was used for video, replacing metadata with urn:embeds.
else:
url_surt = parts[0]
host_key = url_surt.split(")", 1)[0]
Expand Down

0 comments on commit c357cd5

Please sign in to comment.