kiwisdr_com-parse

#! /usr/bin/perl
# KiwiSDR.com receiver list parser for dyatlov map maker
# Copyright 2017, 2020 Pierre Ynard
# Licensed under GPLv3+
#
# This script reads the http://kiwisdr.com/public/ listing HTML from the
# standard input, and writes the corresponding extracted javascript data
# to the standard output.
#
# The output is not JSON, but a ready-to-execute javascript file; so
# data needs to be all the more carefully validated to prevent malicious
# injection.

use strict;
use warnings;

# Escape a string so that it can be safely placed between double quotes
# in the generated javascript (the result is also a valid JSON string
# body).
#
# Takes the raw field value as its only argument and returns the escaped
# copy; the argument itself is not modified.
sub json_escape {
	my $field = shift;

	# Double quotes and backslashes only need a backslash in front
	$field =~ s/(["\\])/\\$1/g;

	# Line breaks must become real escape sequences: a backslash
	# followed by a raw line break is not valid JSON, and javascript
	# reads it as a line continuation that silently drops the
	# character from the string
	$field =~ s/\r/\\r/g;
	$field =~ s/\n/\\n/g;

	# U+2028 and U+2029 are line terminators in javascript too, and
	# are not allowed raw in string literals before ES2019; handle
	# both their UTF-8 byte form and their decoded character form
	$field =~ s/\xE2\x80\xA8|\x{2028}/\\u2028/g;
	$field =~ s/\xE2\x80\xA9|\x{2029}/\\u2029/g;

	# Any other control character is forbidden raw in JSON strings
	$field =~ s/([\x00-\x1F])/sprintf('\\u%04x', ord($1))/ge;

	return $field;
}

print "// KiwiSDR.com receiver list for dyatlov map maker\n// Automatically generated from http://kiwisdr.com/public/\n";

# Extract source data timestamp
my $limit = -1;
# Set once the search has reached a verdict, successful or not, so
# that running out of input can be told apart and reported too
my $timestamp_settled = 0;
while (<STDIN>) {
	# This is where the timestamp is currently found
	if (/<div class='id-text-container'>/) {
		$timestamp_settled = 1;
		if (defined($_ = <STDIN>) && /(.*)/) {
			my $timestamp = $1;
			# A trailing CR only means CRLF line endings
			$timestamp =~ s/\r+\z//;
			# The timestamp ends up inside a // comment of an
			# executable javascript file: CR, U+2028 and U+2029
			# (as UTF-8 bytes or decoded characters) all end
			# such a comment, which would let the rest of the
			# line run as code. Blank them out, along with any
			# other control character.
			$timestamp =~ s/\xE2\x80[\xA8\xA9]|[\x00-\x1F\x7F\x{2028}\x{2029}]/ /g;
			print "// KiwiSDR.com data timestamp: $timestamp\n";
		} else {
			print STDERR "Data timestamp extraction failed.\n";
		}
		last;
	}
	# Safety nets: abort if going too far without finding the timestamp.
	# This tag means that the receiver listing has already been reached.
	elsif (/<div class='cl-entry'>/ || $limit == 0) {
		$timestamp_settled = 1;
		print STDERR "Data timestamp extraction failed.\n";
		last;
	}

	# Safety nets: abort if the timestamp can't be found within
	# so many lines into <body>
	if ($limit < 0 && /<body[ >]/) {
		$limit = 20;
	} elsif ($limit > 0) {
		$limit--;
	}
}

# The input ended before any of the markers above showed up
print STDERR "Data timestamp extraction failed.\n" unless $timestamp_settled;

# Finish the header comments and open the javascript array literal
print "// File generation timestamp: ".gmtime()."\n\nvar kiwisdr_com =\n[\n";

RECEIVER:
while (my $line = <STDIN>) {

	# Only the tag opening a receiver section is of interest here;
	# every other line is skipped
	next RECEIVER unless $line =~ /<div class='cl-info'>/;

	print "\t{\n";

	# The data fields of the receiver follow as HTML comments, one
	# per line. The first line that is not such a comment ends the
	# run, and is consumed without being used.
	FIELD:
	while (defined($line = <STDIN>)) {
		last FIELD unless $line =~ /<!-- ([\w]+)=(.*) -->/a;
		my ($name, $value) = ($1, $2);
		print "\t\t\"$name\":\"".json_escape($value)."\",\n";
	}

	# The URL of the receiver is only available as HTML, on the
	# line after the one that ended the field run
	$line = <STDIN>;
	if (defined($line) && $line =~ />([^<]+)<\/a>/) {
		# Whether URLs are properly XML-encoded is unknown; if
		# they were, XML entities would have to be decoded here.
		my $url = $1;
		print "\t\t\"url\":\"".json_escape($url)."\"\n";
	} else {
		print STDERR "URL extraction failed: ".($line // "truncated input\n");
	}

	print "\t},\n";
}

# Close the array literal and terminate the statement
print "]\n;\n";