From 8f0dc3a4d561539d0a0ffb4af3ae62ed15a32296 Mon Sep 17 00:00:00 2001
From: Adam Watkins
Date: Wed, 1 Jan 2025 12:24:20 +0000
Subject: [PATCH] Move to Selenium

---
Review notes (text in this section is dropped by git-am and is not part of
the commit message):

* What the patch does: adds get_selenium_session(), a closure that opens one
  RSelenium remoteDriver against localhost:4444 and reuses it on later calls
  unless force_new = TRUE, and get_html_for_url(), which navigates that
  session to a URL, optionally sleeps, and parses the page source with
  xml2::read_html(). The page-fetching helpers are switched from read_html()
  to get_html_for_url(), and the workflow installs Java 17 and chromedriver
  and starts a Selenium standalone server in the background.
* R/get_link_to_register_from_ms_page_url.R: the hunk swaps read_html() for
  get_html_for_url() on the page-fetch line and leaves the html_node() XPath
  lookup untouched. An earlier revision replaced html_node() instead, which
  would call get_html_for_url() with an `xpath` argument it does not accept
  and would still fetch the page without Selenium. The abbreviated
  post-image hash on that file's index line was not recomputed.
* Leading whitespace and the position of blank context lines were rebuilt
  from a whitespace-collapsed copy of this patch. If a hunk does not apply
  cleanly, check its context lines against the tree.

 .../download_registers_of_interests.yaml      | 13 ++++++++
 R/get_html_for_url.R                          | 12 +++++++
 R/get_info_from_ms_page_url.R                 |  2 +-
 R/get_interests_from_roi_page_url.R           |  2 +-
 R/get_link_to_register_from_ms_page_url.R     |  2 +-
 R/get_selenium_session.R                      | 31 +++++++++++++++++++
 R/get_senedd_members.R                        |  3 +-
 7 files changed, 61 insertions(+), 4 deletions(-)
 create mode 100644 R/get_html_for_url.R
 create mode 100644 R/get_selenium_session.R

diff --git a/.github/workflows/download_registers_of_interests.yaml b/.github/workflows/download_registers_of_interests.yaml
index 9fbaf5b..73a9f56 100644
--- a/.github/workflows/download_registers_of_interests.yaml
+++ b/.github/workflows/download_registers_of_interests.yaml
@@ -15,6 +15,19 @@ jobs:
       - uses: r-lib/actions/setup-r@v2
       - uses: r-lib/actions/setup-pandoc@v2
 
+      - name: Install Java 17
+        uses: actions/setup-java@v3
+        with:
+          distribution: adopt
+          java-version: '17'
+
+      - uses: nanasess/setup-chromedriver@v2
+
+      - name: Download and start Selenium
+        run: |
+          wget https://selenium-release.storage.googleapis.com/3.9/selenium-server-standalone-3.9.1.jar
+          java -jar selenium-server-standalone-3.9.1.jar &
+
       - name: Query dependencies
         run: |
           install.packages('remotes')
diff --git a/R/get_html_for_url.R b/R/get_html_for_url.R
new file mode 100644
index 0000000..429b6b6
--- /dev/null
+++ b/R/get_html_for_url.R
@@ -0,0 +1,12 @@
+get_html_for_url <- function(url, sleep=0){
+
+  remDr <- get_selenium_session()
+
+  remDr$navigate(url)
+
+  if(sleep > 0){
+    Sys.sleep(sleep)
+  }
+
+  remDr$getPageSource() |> dplyr::first() |> xml2::read_html()
+}
diff --git a/R/get_info_from_ms_page_url.R b/R/get_info_from_ms_page_url.R
index 72a367d..45b2aa2 100644
--- a/R/get_info_from_ms_page_url.R
+++ b/R/get_info_from_ms_page_url.R
@@ -1,5 +1,5 @@
 get_info_from_ms_page_url <- function(ms_page_url, ms_welsh_page_url=NULL){
-  ms_page <- read_html(ms_page_url)
+  ms_page <- get_html_for_url(ms_page_url)
 
   ret_info <- list()
 
diff --git a/R/get_interests_from_roi_page_url.R b/R/get_interests_from_roi_page_url.R
index 2d7bd9e..3713315 100644
--- a/R/get_interests_from_roi_page_url.R
+++ b/R/get_interests_from_roi_page_url.R
@@ -1,6 +1,6 @@
 get_interests_from_roi_page_url = function(roi_page_url){
 
-  roi_page <- read_html(roi_page_url)
+  roi_page <- get_html_for_url(roi_page_url)
   roi_tables <- roi_page %>% html_nodes(".mgInterestsTable")
 
   ret <- tibble(
diff --git a/R/get_link_to_register_from_ms_page_url.R b/R/get_link_to_register_from_ms_page_url.R
index 87ef022..f34a095 100644
--- a/R/get_link_to_register_from_ms_page_url.R
+++ b/R/get_link_to_register_from_ms_page_url.R
@@ -1,5 +1,5 @@
 # Manual fallback - generally you can just use https://business.senedd.wales/mgRofI.aspx?UID=
 get_link_to_register_from_ms_page_url <- function(ms_page_url){
-  ms_page <- read_html(ms_page_url)
+  ms_page <- get_html_for_url(ms_page_url)
   ms_page %>% html_node(xpath='//a[contains(text(), "View Register")]') %>% html_attr("href")
 }
\ No newline at end of file
diff --git a/R/get_selenium_session.R b/R/get_selenium_session.R
new file mode 100644
index 0000000..d9085d3
--- /dev/null
+++ b/R/get_selenium_session.R
@@ -0,0 +1,31 @@
+get_selenium_session_impl <- function() {
+  sess <- NULL
+
+  function (force_new = FALSE){
+    if(!is.null(sess) && !force_new){
+      return(sess)
+    }
+
+    remDr <- RSelenium::remoteDriver(
+      remoteServerAddr = "localhost",
+      port = 4444L,
+      browserName = "chrome",
+      extraCapabilities = list(
+        chromeOptions = list(
+          args = list(
+            "--headless=new",
+            "--user-agent=Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36",
+            "--disable-gpu",
+            "--disable-dev-shm-usage",
+            "--disable-extensions")))
+    )
+
+    remDr$open()
+
+    sess <<- remDr
+
+    return(sess)
+  }
+}
+
+get_selenium_session <- get_selenium_session_impl()
diff --git a/R/get_senedd_members.R b/R/get_senedd_members.R
index 8c6650f..51f9ba5 100644
--- a/R/get_senedd_members.R
+++ b/R/get_senedd_members.R
@@ -13,7 +13,7 @@ get_senedd_members <- function(extra_info=FALSE){
     }
 
     try(
-      member_list_html <- read_html("https://senedd.wales/find-a-member-of-the-senedd/?VW=Table&PageSize=10000&Page=1&Culture=en-GB&IsSubSearch=False&IsPostcodeCrossConstituency=False&Postcode=&Name=&ShowAll=true&Region=&Constituency=&Constituency=&Constituency=&Constituency=&Constituency=&PartyFilterType=party&PoliticalParty=&PoliticalPartyGroup=&partyValueName=")
+      member_list_html <- get_html_for_url("https://senedd.wales/find-a-member-of-the-senedd/?isActiveMs=true", 0.5)
     )
   }
 
@@ -43,6 +43,7 @@ get_senedd_members <- function(extra_info=FALSE){
     ))
   }
 
+
   ret <- ret %>% mutate(
     Constituency = if_else(ConstituencyOrRegion %in% senedd_constituencies()$SeneddConstituencyName, ConstituencyOrRegion, NA_character_),
     Region = if_else(ConstituencyOrRegion %in% senedd_constituencies()$SeneddRegionName, ConstituencyOrRegion, NA_character_)