Skip to content

Commit

Permalink
adapted changes of selectors in tests and examples
Browse files Browse the repository at this point in the history
  • Loading branch information
schochastics committed Oct 13, 2024
1 parent 96fe77d commit b3789b5
Show file tree
Hide file tree
Showing 6 changed files with 39 additions and 35 deletions.
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
Package: webbotparseR
Title: Parse html files containing search engine results
Version: 0.0.1.9000
Version: 0.1.0.9000
Authors@R:
c(person("David", "Schoch", , "david@schochastics.net", role = c("aut", "cre"), comment = c(ORCID = "0000-0003-2952-4812")),
person("Chung-hong", "Chan", ,"chainsawtiney@gmail.com", role = c("aut"), comment = c(ORCID = "0000-0002-6232-7530")))
Expand Down
4 changes: 2 additions & 2 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# webbotparseR 0.0.1.9000
# webbotparseR 0.1.0.9000

* added partial new google text selector
* added new google text selector (all example data needs `selectors="ver1"` now)

# webbotparseR 0.0.0.9000

Expand Down
20 changes: 11 additions & 9 deletions R/base64.R
Original file line number Diff line number Diff line change
Expand Up @@ -9,16 +9,18 @@
#' @export
#' @examples
#' \dontrun{
#' data_uri <- paste0("data:image/png;base64,",
#' base64enc::base64encode(system.file('logo.png', package = "webbotparseR")))
#' base64_to_img(data_uri,"logo")
#' data_uri <- paste0(
#' "data:image/png;base64,",
#' base64enc::base64encode(system.file("logo.png", package = "webbotparseR"))
#' )
#' base64_to_img(data_uri, "logo")
#' }
base64_to_img <- function(data_uri,slug){
img_type <- sub("data:image/([a-zA-Z]+);base64,.*","\\1",data_uri)
img64 <- sub("data:image/[a-zA-Z]+;base64,","",data_uri)
img_file <- paste0(slug,".",img_type)
conn <- file(img_file,"wb")
base64enc::base64decode(what = img64,output = conn)
base64_to_img <- function(data_uri, slug) {
  # The image subtype in the data URI header (e.g. "png") becomes the file
  # extension of the output file.
  img_type <- sub("data:image/([a-zA-Z]+);base64,.*", "\\1", data_uri)
  # Strip the "data:image/<type>;base64," header so only the payload remains.
  img64 <- sub("data:image/[a-zA-Z]+;base64,", "", data_uri)
  img_file <- paste0(slug, ".", img_type)
  conn <- file(img_file, "wb")
  # Register the cleanup right after opening: if decoding fails, the binary
  # connection is still closed instead of leaking an open file handle.
  on.exit(close(conn), add = TRUE)
  base64enc::base64decode(what = img64, output = conn)
  # Return the base64 payload invisibly; the function is called for its
  # side effect of writing "<slug>.<type>".
  invisible(img64)
}
2 changes: 1 addition & 1 deletion R/parser.R
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
#' package = "webbotparseR"
#' )
#'
#' parse_search_results(search_html, engine = "google text")
#' parse_search_results(search_html, engine = "google text", selectors = "ver1")
#' @export
parse_search_results <- function(path, engine, selectors = "latest") {
current_selectors <- .get_selectors(selectors)
Expand Down
39 changes: 20 additions & 19 deletions tests/testthat/test-parser.R
Original file line number Diff line number Diff line change
@@ -1,38 +1,39 @@
sel <- "ver1"
test_that("google text latest selector", {
expect_no_error(output <- parse_search_results("../testdata/www.google.com_query_text_2023-03-16_08_16_05.html",engine = "google text"))
expect_s3_class(output,"tbl_df")
expect_no_error(output <- parse_search_results("../testdata/www.google.com_query_text_2023-03-16_08_16_05.html", engine = "google text", selectors = sel))
expect_s3_class(output, "tbl_df")
})

test_that("google news latest selector", {
expect_no_error(output <- parse_search_results("../testdata/www.google.com_query_news_2023-03-16_08_15_05.html",engine = "google news"))
expect_s3_class(output,"tbl_df")
expect_no_error(output <- parse_search_results("../testdata/www.google.com_query_news_2023-03-16_08_15_05.html", engine = "google news", selectors = sel))
expect_s3_class(output, "tbl_df")
})

test_that("ddg text latest selector", {
expect_no_error(output <- parse_search_results("../testdata/duckduckgo.com_query_text_2023-03-16_09_17_19.html",engine = "duckduckgo text"))
expect_s3_class(output,"tbl_df")
expect_no_error(output <- parse_search_results("../testdata/duckduckgo.com_query_text_2023-03-16_09_17_19.html", engine = "duckduckgo text", selectors = sel))
expect_s3_class(output, "tbl_df")
})

test_that("ddg news latest selector", {
expect_no_error(output <- parse_search_results("../testdata/duckduckgo.com_query_news_2023-03-16_09_15_05.html",engine = "duckduckgo text"))
expect_s3_class(output,"tbl_df")
expect_no_error(output <- parse_search_results("../testdata/duckduckgo.com_query_news_2023-03-16_09_15_05.html", engine = "duckduckgo text", selectors = sel))
expect_s3_class(output, "tbl_df")
})

test_that("yahoo text latest selector", {
expect_no_error(output <- parse_search_results("../testdata/us.yahoo.com_query_text_2023-03-20_07_57_12.html",engine = "yahoo text"))
expect_s3_class(output,"tbl_df")
expect_no_error(output <- parse_search_results("../testdata/us.yahoo.com_query_text_2023-03-20_07_57_12.html", engine = "yahoo text", selectors = sel))
expect_s3_class(output, "tbl_df")
})

test_that("metadata reading",{
test_that("metadata reading", {
expect_no_error(meta <- parse_metadata(".../testdata/www.google.com_query_text_2023-03-16_08_16_11.html"))
expect_equal(meta$search_engine,"www.google.com")
expect_equal(meta$type,"text")
expect_equal(meta$query,"query")
expect_equal(as.Date(meta$date),as.Date("2023-03-16"))
expect_equal(meta$search_engine, "www.google.com")
expect_equal(meta$type, "text")
expect_equal(meta$query, "query")
expect_equal(as.Date(meta$date), as.Date("2023-03-16"))
})

test_that("folder parsing",{
expect_no_error(output <- parse_search_results("../testdata", engine = "google text"))
expect_error(parse_search_results("../testdata/", engine = "google images"))
expect_error(parse_search_results("void", engine = "google text"))
# Exercises parse_search_results() with a directory path and with inputs
# that are expected to fail. `sel` is the selector version set at the top of
# this test file.
test_that("folder parsing", {
# A folder path (rather than a single html file) must be accepted.
expect_no_error(output <- parse_search_results("../testdata", engine = "google text", selectors = sel))
# Same folder with engine "google images" is expected to raise.
# NOTE(review): the reason (unsupported engine vs. non-matching fixtures)
# is not visible here -- confirm against parse_search_results().
expect_error(parse_search_results("../testdata/", engine = "google images", selectors = sel))
# "void" is presumably a path that does not exist; an error is expected.
expect_error(parse_search_results("void", engine = "google text", selectors = sel))
})
7 changes: 4 additions & 3 deletions tests/testthat/test-selector_versions.R
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
test_that(".is_webbot_selectors", {
expect_true(.is_webbot_selectors(selectors_library$get("ver1")))
expect_false(.is_webbot_selectors(list(a = c(1,2,3))))
expect_false(.is_webbot_selectors(list(a = c(1, 2, 3))))
})

test_that(".get_selectors", {
Expand All @@ -25,7 +25,8 @@ test_that(".get_selectors really working not NULL", {
fake_library$set("ver2", fake_selectors_v2)
fake_versions <- data.frame(
version = c("ver1", "ver2"),
snapshot_date = c(as.Date("2023-03-17"), as.Date("2047-07-01")))
snapshot_date = c(as.Date("2023-03-17"), as.Date("2047-07-01"))
)
res <- .get_selectors("ver1", lib = fake_library, vers = fake_versions)
expect_equal(res, fake_library$get("ver1"))
expect_equal(class(res$`google images`), "list")
Expand All @@ -40,7 +41,7 @@ test_that(".get_selectors really working not NULL", {
})

test_that("integration", {
res1 <- parse_search_results("../testdata/www.google.com_query_text_2023-03-16_08_16_05.html", "google text", "latest")
res1 <- parse_search_results("../testdata/www.google.com_query_text_2023-03-16_08_16_05.html", "google text", "ver1")
res2 <- parse_search_results("../testdata/www.google.com_query_text_2023-03-16_08_16_05.html", "google text", "ver1")
res3 <- parse_search_results("../testdata/www.google.com_query_text_2023-03-16_08_16_05.html", "google text", selectors_library$get("ver1"))
expect_equal(res1, res2)
Expand Down

0 comments on commit b3789b5

Please sign in to comment.