Skip to content

Commit

Permalink
Move to Selenium
Browse files Browse the repository at this point in the history
  • Loading branch information
stupidpupil committed Jan 1, 2025
1 parent 666c2db commit 8f0dc3a
Show file tree
Hide file tree
Showing 7 changed files with 61 additions and 4 deletions.
13 changes: 13 additions & 0 deletions .github/workflows/download_registers_of_interests.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,19 @@ jobs:
- uses: r-lib/actions/setup-r@v2
- uses: r-lib/actions/setup-pandoc@v2

- name: Install Java 17
uses: actions/setup-java@v3
with:
distribution: adopt
java-version: '17'

- uses: nanasess/setup-chromedriver@v2

- name: Download and start Selenium
run: |
wget https://selenium-release.storage.googleapis.com/3.9/selenium-server-standalone-3.9.1.jar
java -jar selenium-server-standalone-3.9.1.jar &
- name: Query dependencies
run: |
install.packages('remotes')
Expand Down
12 changes: 12 additions & 0 deletions R/get_html_for_url.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
#' Load a URL in the shared Selenium browser session and parse the result
#'
#' @param url Address to navigate the browser to.
#' @param sleep Seconds to pause after navigating and before the page source
#'   is read; no pause is made unless this is greater than zero.
#' @return The page source parsed by `xml2::read_html()`.
get_html_for_url <- function(url, sleep=0){
  session <- get_selenium_session()
  session$navigate(url)

  # Optional fixed delay, presumably to let client-side rendering finish
  # before the source is captured -- TODO confirm against callers.
  if (sleep > 0) Sys.sleep(sleep)

  # Take the first element of whatever getPageSource() returns and parse it.
  page_source <- dplyr::first(session$getPageSource())
  xml2::read_html(page_source)
}
2 changes: 1 addition & 1 deletion R/get_info_from_ms_page_url.R
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
get_info_from_ms_page_url <- function(ms_page_url, ms_welsh_page_url=NULL){
ms_page <- read_html(ms_page_url)
ms_page <- get_html_for_url(ms_page_url)

ret_info <- list()

Expand Down
2 changes: 1 addition & 1 deletion R/get_interests_from_roi_page_url.R
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
get_interests_from_roi_page_url = function(roi_page_url){

roi_page <- read_html(roi_page_url)
roi_page <- get_html_for_url(roi_page_url)
roi_tables <- roi_page %>% html_nodes(".mgInterestsTable")

ret <- tibble(
Expand Down
2 changes: 1 addition & 1 deletion R/get_link_to_register_from_ms_page_url.R
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# Manual fallback - generally you can just use https://business.senedd.wales/mgRofI.aspx?UID=
#' Find the "View Register" link on a Member's page
#'
#' @param ms_page_url URL of the Member's page to load.
#' @return The `href` attribute of the anchor whose text contains
#'   "View Register", as returned by `html_attr()`.
get_link_to_register_from_ms_page_url <- function(ms_page_url){
  # Fetch through the Selenium-backed helper, like the other page loaders;
  # get_html_for_url() takes a URL (not a parsed page) and has no xpath argument.
  ms_page <- get_html_for_url(ms_page_url)

  ms_page %>%
    html_node(xpath='//a[contains(text(), "View Register")]') %>%
    html_attr("href")
}
31 changes: 31 additions & 0 deletions R/get_selenium_session.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
#' Build the accessor for the shared Selenium browser session
#'
#' Returns a closure that keeps one `RSelenium::remoteDriver` in its enclosing
#' environment. The driver is created and opened on the first call and handed
#' back on later calls.
#'
#' The returned function takes one argument, `force_new`: when `TRUE` the
#' cached session (if any) is closed and a fresh one is opened in its place.
#'
#' @return A function `(force_new = FALSE)` returning an opened remote driver.
get_selenium_session_impl <- function() {
  sess <- NULL

  function (force_new = FALSE){
    if(!is.null(sess) && !force_new){
      return(sess)
    }

    # Replacing an existing session: close it first so the browser is not left
    # running on the Selenium server. Best effort -- the old session may
    # already be dead, which is a common reason to force a new one.
    if(!is.null(sess)){
      try(sess$close(), silent = TRUE)
      # Drop the stale handle so a failed open() below cannot leave a closed
      # session in the cache.
      sess <<- NULL
    }

    remDr <- RSelenium::remoteDriver(
      remoteServerAddr = "localhost",
      port = 4444L,
      browserName = "chrome",
      extraCapabilities = list(
        chromeOptions = list(
          args = list(
            "--headless=new",
            "--user-agent=Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36",
            "--disable-gpu",
            "--disable-dev-shm-usage",
            "--disable-extensions")))
    )

    remDr$open()

    # Cache only after open() has succeeded.
    sess <<- remDr

    sess
  }
}

# Accessor used by callers; all calls share the one cached session.
get_selenium_session <- get_selenium_session_impl()
3 changes: 2 additions & 1 deletion R/get_senedd_members.R
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ get_senedd_members <- function(extra_info=FALSE){
}

try(
member_list_html <- read_html("https://senedd.wales/find-a-member-of-the-senedd/?VW=Table&PageSize=10000&Page=1&Culture=en-GB&IsSubSearch=False&IsPostcodeCrossConstituency=False&Postcode=&Name=&ShowAll=true&Region=&Constituency=&Constituency=&Constituency=&Constituency=&Constituency=&PartyFilterType=party&PoliticalParty=&PoliticalPartyGroup=&partyValueName=")
member_list_html <- get_html_for_url("https://senedd.wales/find-a-member-of-the-senedd/?isActiveMs=true", 0.5)
)
}

Expand Down Expand Up @@ -43,6 +43,7 @@ get_senedd_members <- function(extra_info=FALSE){
))
}


ret <- ret %>% mutate(
Constituency = if_else(ConstituencyOrRegion %in% senedd_constituencies()$SeneddConstituencyName, ConstituencyOrRegion, NA_character_),
Region = if_else(ConstituencyOrRegion %in% senedd_constituencies()$SeneddRegionName, ConstituencyOrRegion, NA_character_)
Expand Down

0 comments on commit 8f0dc3a

Please sign in to comment.