Skip to content

Commit

Permalink
feat: keep saving in file all observed domains with minimum observations
Browse files Browse the repository at this point in the history
  • Loading branch information
Guillaume NICOLAS committed Sep 2, 2024
1 parent 0bda124 commit 0aa8372
Show file tree
Hide file tree
Showing 2 changed files with 50 additions and 22 deletions.
51 changes: 29 additions & 22 deletions bin/automated-update.js
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,7 @@ async function main() {

const observedDomainsFilename = `${__dirname}/../data/${dateStringHypens}-observed-domains.json`
const entityScriptingFilename = `${__dirname}/../data/${dateStringHypens}-entity-scripting.json`
const mostObservedDomainsFilename = `${__dirname}/../sql/most-observed-domains-query.sql`
const allObservedDomainsFilename = `${__dirname}/../sql/all-observed-domains-query.sql`
const entityPerPageFilename = `${__dirname}/../sql/entity-per-page.sql`

Expand All @@ -131,6 +132,10 @@ async function main() {
exitFn: () => process.exit(1),
})

const mostObservedDomainsQuery = getQueryForTable(
mostObservedDomainsFilename,
dateStringUnderscore
)
const allObservedDomainsQuery = getQueryForTable(allObservedDomainsFilename, dateStringUnderscore)
const entityPerPageQuery = getQueryForTable(entityPerPageFilename, dateStringUnderscore)

Expand All @@ -142,24 +147,21 @@ async function main() {

const start = Date.now()

const domainEntityMapping = entities.reduce((array, {name, domains}) => {
return array.concat(domains.map(domain => ({name, domain})))
}, [])

const resultsStream = await getQueryResultStream(allObservedDomainsQuery, {
entities_string: JSON.stringify(domainEntityMapping),
})

// Observed domain json file pipe
// 1. Get the domains observed at least 50 times and write them to the 'observed-domains' JSON file
let observedDomainsNbRows = 0
const observedDomainsFileWriterStream = fs.createWriteStream(observedDomainsFilename)
resultsStream
// stringify observed domain json (with json array prefix based on row index)
.pipe(getJSONStringTransformer(observedDomainsNbRows))
// write to observed-domains json file
.pipe(observedDomainsFileWriterStream)
await getQueryResultStream(mostObservedDomainsQuery).then(stream => {
stream
// stringify observed domain json (with json array prefix based on row index)
.pipe(getJSONStringTransformer(observedDomainsNbRows))
// write to observed-domains json file
.pipe(observedDomainsFileWriterStream)
})

// Observed domain entity mapping table pipe
// 2. Get all observed domains mapped to an entity and observed at least 50 times, and write them to the 'third_party_web' table
const domainEntityMapping = entities.reduce((array, {name, domains}) => {
return array.concat(domains.map(domain => ({name, domain})))
}, [])
const thirdPartyWebTableWriterStream = new BigQuery()
.dataset('third_party_web')
.table(dateStringUnderscore)
Expand All @@ -170,13 +172,18 @@ async function main() {
{name: 'category', type: 'STRING'},
],
})
resultsStream
// map observed domain to entity
.pipe(EntityCanonicalDomainTransformer)
// stringify json
.pipe(getJSONStringTransformer())
// write to third_party_web table
.pipe(thirdPartyWebTableWriterStream)

await getQueryResultStream(allObservedDomainsQuery, {
entities_string: JSON.stringify(domainEntityMapping),
}).then(stream => {
stream
// map observed domain to entity
.pipe(EntityCanonicalDomainTransformer)
// stringify json
.pipe(getJSONStringTransformer())
// write to third_party_web table
.pipe(thirdPartyWebTableWriterStream)
})

// Wait for both streams to finish
await resolveOnFinished([observedDomainsFileWriterStream, thirdPartyWebTableWriterStream])
Expand Down
21 changes: 21 additions & 0 deletions sql/most-observed-domains-query.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
-- Lists each host extracted from `url` together with the number of distinct `page`
-- values it appears with, keeping only hosts that reach the minimum of 50 and
-- returning the most frequently observed hosts first.
SELECT
domain,
-- The subquery yields one row per (page, domain) pair, so this counts pairs per domain.
COUNT(0) AS totalOccurrences
FROM
(
-- Collapse the raw request rows to one row per (page, domain) pair.
SELECT
page,
NET.HOST(url) AS domain,
-- Number of request rows for this (page, domain) pair.
-- NOTE(review): the outer SELECT does not list this column; confirm it is still needed.
COUNT(0) AS totalOccurrences
FROM
-- NOTE(review): bin/automated-update.js loads this file through getQueryForTable together
-- with a date string, presumably to retarget the table below; confirm against that helper.
`httparchive.requests.2022_01_01_mobile`
GROUP BY
page,
domain
)
GROUP BY
domain
HAVING
-- Minimum-observation threshold (inclusive).
totalOccurrences >= 50
ORDER BY
totalOccurrences DESC

0 comments on commit 0aa8372

Please sign in to comment.