From 1baf93ef65be4166a5bdb06f56ee67fe9e23b71a Mon Sep 17 00:00:00 2001 From: Nigel Nindo Date: Wed, 18 Oct 2023 09:52:48 +0300 Subject: [PATCH] Create general scroller --- .vscode/settings.json | 1 + src/core/scrapping/orbit/index.ts | 12 +++- .../scrapping/scrolling_scrapper/index.ts | 41 +++++------ .../build_scrapers/orbit/three_way/index.ts | 71 +++++++++++++++++++ 4 files changed, 103 insertions(+), 22 deletions(-) diff --git a/.vscode/settings.json b/.vscode/settings.json index 2d97b82..38ba6ae 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -1,6 +1,7 @@ { "cSpell.words": [ "BETIKA", + "Millis", "SPORTPESA", "typeorm" ] diff --git a/src/core/scrapping/orbit/index.ts b/src/core/scrapping/orbit/index.ts index 3a818ac..4089b5d 100644 --- a/src/core/scrapping/orbit/index.ts +++ b/src/core/scrapping/orbit/index.ts @@ -5,7 +5,7 @@ import { OrbitProvider } from "../../../bet_providers/orbit"; import { RedisSingleton } from "../../../datastores/redis"; import { PuppeteerPageLoadPolicy } from "../../../utils/types/common"; import { Result } from "../../../utils/types/result_type"; -import { getHtmlForPage } from "../scrolling_scrapper"; +import { getHtmlForScrollingPage } from "../scrolling_scrapper"; const {logger} = getConfig(); @@ -47,7 +47,15 @@ export class OrbitScrapper extends BaseScrapper { logger.info("New request to fetch game events: ", metadata); - const getHtmlResult = await getHtmlForPage(browserInstance, game.url, PuppeteerPageLoadPolicy.LOAD); + const getHtmlResult = await getHtmlForScrollingPage( + browserInstance, + game.url, + PuppeteerPageLoadPolicy.LOAD, + ".biab_body.contentWrap", // scrollingElementSelector + 2000, // delayBeforeNextScrollAttemptMillis + 30, // numScrollAttempts + 150 // scrollDelta + ); if (getHtmlResult.result === "success") { logger.info("Successfully fetched html for url. 
", metadata); diff --git a/src/core/scrapping/scrolling_scrapper/index.ts b/src/core/scrapping/scrolling_scrapper/index.ts index ab5bb35..3d481a7 100644 --- a/src/core/scrapping/scrolling_scrapper/index.ts +++ b/src/core/scrapping/scrolling_scrapper/index.ts @@ -8,17 +8,21 @@ import { getConfig } from '../../..'; const {logger} = getConfig(); -export async function getHtmlForPage( +export async function getHtmlForScrollingPage( browser: puppeteer.Browser, url: string, - waitUntilPolicy: PuppeteerPageLoadPolicy + waitUntilPolicy: PuppeteerPageLoadPolicy, + scrollingElementSelector: string, + delayBeforeNextScrollAttemptMillis: number, + numScrollAttempts: number, + scrollDelta: number ): Promise> { try { const page1 = await browser.newPage(); await page1.setViewport({width: 1280, height: 720}); await page1.goto(url, {waitUntil: waitUntilPolicy}); await setTimeout(15000); - await getScrollContent(page1); + await getScrollContent(page1, scrollingElementSelector, delayBeforeNextScrollAttemptMillis, numScrollAttempts, scrollDelta); const html = await page1.content(); return {result: "success", value: {html, forUrl: url}}; } catch (e: any) { @@ -32,33 +36,30 @@ export async function getHtmlForPage( * TODO: Move selector code to individual provider, but keep scroll behavior same across the board. * @param page */ -async function getScrollContent(page: puppeteer.Page): Promise { +async function getScrollContent( + page: puppeteer.Page, + scrollingElementSelector: string, + delayBeforeNextScrollAttemptMillis: number, + numScrollAttempts: number, + scrollDelta: number): Promise { logger.trace("Running scroll down function"); - const section = await page.$('.biab_body.contentWrap'); // find containing body of the content. In this case it's a
+ const section = await page.$(scrollingElementSelector); // find containing body of the content. In this case it's a <div>
if (section !== null) { - logger.trace("Found section"); - - /** - * Using a set number of scrolls to fetch new content. - * Chose this method for simplicity, but a more advanced method - * would check for no changes in the dimensions of the bounding - * box to determine that no new content is available. - */ - const numScrolls = 30; + logger.info("Found scroll section"); let counter = 1; - const delayBetweenScrollsMills = 2000; // give time for the page to make AJAX call for new content. - for await (const value of setInterval(delayBetweenScrollsMills, numScrolls)) { + for await (const value of setInterval(delayBeforeNextScrollAttemptMillis, numScrollAttempts)) { if (counter > value) { break; // stop scrolling for new data } else { const boundingBox = await getBoundingBox(section); - scrollDown(page, boundingBox); + scrollDown(page, boundingBox, scrollDelta); + counter = counter + 1; } } return true; } else { - logger.trace("Failed to find section."); + logger.error("Failed to find scroll section."); return false; } } @@ -78,7 +79,7 @@ async function getBoundingBox(elementHandle: puppeteer.ElementHandle): Promise

{ +async function scrollDown(page: puppeteer.Page, boundingBox: puppeteer.BoundingBox, scrollDelta: number): Promise { // move mouse to the center of the element to be scrolled page.mouse.move( boundingBox.x + boundingBox.width / 2, @@ -86,5 +87,5 @@ async function scrollDown(page: puppeteer.Page, boundingBox: puppeteer.BoundingB ); // use the mouse scroll wheel to to scroll. Change scroll down delta according to your needs. - await page.mouse.wheel({deltaY: 300}); + await page.mouse.wheel({deltaY: scrollDelta}); } diff --git a/src/testbed/build_scrapers/orbit/three_way/index.ts b/src/testbed/build_scrapers/orbit/three_way/index.ts index e69de29..ef21c1d 100644 --- a/src/testbed/build_scrapers/orbit/three_way/index.ts +++ b/src/testbed/build_scrapers/orbit/three_way/index.ts @@ -0,0 +1,71 @@ +//@ts-ignore +import * as cheerio from "cheerio"; + +import { getConfig } from "../../../.."; +import { readFileAsync } from "../../../../utils/file_system"; + +const {logger} = getConfig(); + +class OrbitThreeWayTestBed { + public async run() { + const htmlDataResult = await readFileAsync("data/test_html/orbit/football.html"); + if (htmlDataResult.result === "success") { + logger.info("Success fetching html"); + const $ = cheerio.load(htmlDataResult.value); + + + /** + * Bypassing .biab_group-markets-table div and going directly to .rowContainer + */ + $("div.rowsContainer").each((_,element) => { + const data = $(element).find("div.biab_group-markets-table-row"); + + data.each((_, element_1) => { + const teamNames = $(element_1).find("div > div.biab_market-title-team-names"); + const clubA = $(teamNames).find("p:nth-child(1)").text().trim(); + const clubB = $(teamNames).find("p:nth-child(2)").text().trim(); + + const numBets = $(element_1).find("div > span.cursor-help").text().trim(); + + const oddsWrapper = $(element_1).find("div.styles_betContent__wrapper__25jEo"); + + //const oddFinder = "div.styles_contents__Kf8LQ > button > span > div > span.styles_betOdds__bxapE"; + + 
const odds = $(oddsWrapper).find("div.styles_contents__Kf8LQ > button > span > div > span.styles_betOdds__bxapE"); + + // expecting 3 pairs of odds for W,D,L. So 6 in total + const oddsArray = []; + odds.each((_, element_2) => { + oddsArray.push($(element_2).text().trim()); + logger.trace($(element_2).text().trim()); + }); + + + logger.trace(`${clubA} vs ${clubB}`); + logger.trace("numBets: ", numBets); + //logger.trace("odd1: ", $(oddsWrapper).find(`${oddFinder}:nth-child(1)`)); + }); + }); + } else { + const message = "Could not get html data"; + throw new Error(message); + } + } + + /** + * Remove games with missing odds data. + * @param oddsArray + */ + isValid(oddsArray: string[]): boolean { + let missingOddFound = false; + oddsArray.forEach(odd => { + if (odd === ""){ + missingOddFound = true; + } + }) + return !missingOddFound; + } +} + +const testBed = new OrbitThreeWayTestBed(); +testBed.run();