From 3e9bead0245349ff4d76dd8f19f6c2937efc0fc4 Mon Sep 17 00:00:00 2001
From: Nigel Nindo
Date: Tue, 17 Oct 2023 11:15:43 +0300
Subject: [PATCH] Add scrolling scraper and Orbit provider

---
 src/bet_providers/orbit.ts                     | 14 +++
 src/config/betika.json                         |  5 ++
 src/config/orbit.json                          | 10 +++
 src/core/scrapping/betika/index.ts             |  7 +-
 src/core/scrapping/orbit/index.ts              | 73 +++++++++++++++
 .../scrapping/scrolling_scrapper/index.ts      | 90 +++++++++++++++++++
 src/core/scrapping/simple_scraper/index.ts     |  4 +-
 .../build_scrapers/orbit/three_way/index.ts    |  0
 src/testbed/testbed.ts                         |  8 +-
 src/utils/types/common/index.ts                |  3 +-
 10 files changed, 204 insertions(+), 10 deletions(-)
 create mode 100644 src/bet_providers/orbit.ts
 create mode 100644 src/config/orbit.json
 create mode 100644 src/core/scrapping/orbit/index.ts
 create mode 100644 src/core/scrapping/scrolling_scrapper/index.ts
 create mode 100644 src/testbed/build_scrapers/orbit/three_way/index.ts

diff --git a/src/bet_providers/orbit.ts b/src/bet_providers/orbit.ts
new file mode 100644
index 0000000..88b0213
--- /dev/null
+++ b/src/bet_providers/orbit.ts
@@ -0,0 +1,14 @@
+import { BetProvider } from ".";
+import { BetProviders, Games } from "../utils/types/common";
+
+export class OrbitProvider extends BetProvider {
+    constructor() {
+        super(BetProviders.ORBIT, "src/config/orbit.json");
+    }
+
+    override getSupportedGames(): Games[] {
+        return [
+            Games.FOOTBALL
+        ];
+    }
+}
diff --git a/src/config/betika.json b/src/config/betika.json
index a19226a..a6022a8 100644
--- a/src/config/betika.json
+++ b/src/config/betika.json
@@ -10,6 +10,11 @@
             "name": "Basketball",
             "betType": "Three Way",
             "url": "https://www.betika.com/lite/en-ke/?sport_id=30&tag_id=&tab_id=-2"
+        },
+        {
+            "name": "Football",
+            "betType": "Three Way",
+            "url": "https://www.betika.com/lite/en-ke/?tab_id=-2&page=9&sub_type_id=1,186&sport_id=14"
         }
     ]
 }
\ No newline at end of file
diff --git a/src/config/orbit.json b/src/config/orbit.json
new file mode 100644
index 0000000..d3ff42f
--- /dev/null
+++ b/src/config/orbit.json
@@ -0,0 +1,10 @@
+{
+    "version": "1.0.0",
+    "games": [
+        {
+            "name": "Football",
+            "betType": "Three Way",
+            "url": "https://www.orbitxch.com/customer/sport/1"
+        }
+    ]
+}
\ No newline at end of file
diff --git a/src/core/scrapping/betika/index.ts b/src/core/scrapping/betika/index.ts
index 08e117f..d50ea81 100644
--- a/src/core/scrapping/betika/index.ts
+++ b/src/core/scrapping/betika/index.ts
@@ -23,6 +23,11 @@ export class BetikaScrapper extends BaseScrapper {
         this.scrapeIntervalDuration = 10000;
     }
 
+    /**
+     * Fetches data from Betika. Data is fetched per sport from the lite version of the website,
+     * which is paginated, so fetching includes a mechanism for moving on to new pages.
+     * @returns
+     */
     public async fetchData(): Promise<Result<boolean, Error>> {
         const getBetProviderConfigResult = await this.betProvider.getConfig();
 
@@ -55,7 +60,7 @@ export class BetikaScrapper extends BaseScrapper {
                     const getHtmlResult = await getHtmlForPage(browserInstance, completedUrl, PuppeteerPageLoadPolicy.DOM_CONTENT_LOADED);
 
                     if (getHtmlResult.result === "success") {
-                        logger.info("Successfully fetched html for url", metadata);
+                        logger.info("Successfully fetched html for url.", metadata);
                         if (this.pageHasNoGameEvents(getHtmlResult.value.html)) {
                             logger.info("No game events found. Stopping HTML fetch for current game.", metadata);
                             break;
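The new fetchData doc comment above describes page-by-page fetching, but the paging loop itself sits outside this hunk. Below is a minimal sketch of how that flow can look, assuming the lite site selects pages via a "page" query parameter and reusing the getHtmlForPage and pageHasNoGameEvents helpers that appear in this diff; the import paths and URL handling are illustrative, not the project's actual implementation.

    import * as puppeteer from "puppeteer";
    import { getHtmlForPage } from "../simple_scraper";
    import { PuppeteerPageLoadPolicy } from "../../../utils/types/common";

    // Illustrative only: fetch the lite-site pages for one game until a page has no events.
    async function fetchAllPagesForGame(
        browserInstance: puppeteer.Browser,
        baseUrl: string,                                // a "url" entry from src/config/betika.json
        pageHasNoGameEvents: (html: string) => boolean  // same check used by BetikaScrapper
    ): Promise<string[]> {
        const htmlPages: string[] = [];

        for (let pageNumber = 1; ; pageNumber++) {
            // Assumption: the page number is carried in a "page" query parameter.
            const completedUrl = new URL(baseUrl);
            completedUrl.searchParams.set("page", String(pageNumber));

            const getHtmlResult = await getHtmlForPage(browserInstance, completedUrl.toString(), PuppeteerPageLoadPolicy.DOM_CONTENT_LOADED);

            if (getHtmlResult.result !== "success") {
                break; // stop on a failed fetch
            }
            if (pageHasNoGameEvents(getHtmlResult.value.html)) {
                break; // stop once a page comes back with no game events
            }

            htmlPages.push(getHtmlResult.value.html);
        }

        return htmlPages;
    }
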
diff --git a/src/core/scrapping/orbit/index.ts b/src/core/scrapping/orbit/index.ts
new file mode 100644
index 0000000..3a818ac
--- /dev/null
+++ b/src/core/scrapping/orbit/index.ts
@@ -0,0 +1,73 @@
+import { BaseScrapper } from "..";
+import { getConfig } from "../../..";
+import { BetProvider } from "../../../bet_providers";
+import { OrbitProvider } from "../../../bet_providers/orbit";
+import { RedisSingleton } from "../../../datastores/redis";
+import { PuppeteerPageLoadPolicy } from "../../../utils/types/common";
+import { Result } from "../../../utils/types/result_type";
+import { getHtmlForPage } from "../scrolling_scrapper";
+
+const {logger} = getConfig();
+
+export class OrbitScrapper extends BaseScrapper {
+    public override betProvider: BetProvider;
+    public override scrapeIntervalDuration: number;
+
+    constructor() {
+        super();
+        this.betProvider = new OrbitProvider();
+        this.scrapeIntervalDuration = 10000;
+    }
+
+    /**
+     * Fetches data from Orbit, which mirrors Betfair data. Data is fetched per sport, and each
+     * sport's events are loaded as an infinitely scrolling list.
+     * @returns
+     */
+    public async fetchData(): Promise<Result<boolean, Error>> {
+        const getBetProviderConfigResult = await this.betProvider.getConfig();
+
+        if (getBetProviderConfigResult.result === "error") {
+            logger.error("Failed to get config for provider: ", this.betProvider);
+            return getBetProviderConfigResult;
+        }
+
+        const getRedisPublisherResult = await RedisSingleton.getPublisher();
+
+        if (getRedisPublisherResult.result === "success") {
+            const betProviderConfig = getBetProviderConfigResult.value;
+            const browserInstance = await this.initializeBrowserInstance();
+
+            const result = betProviderConfig.games.map(async game => {
+                const metadata = {
+                    betProviderName: this.betProvider.name,
+                    game: game.name,
+                    url: game.url
+                };
+
+                logger.info("New request to fetch game events: ", metadata);
+
+                const getHtmlResult = await getHtmlForPage(browserInstance, game.url, PuppeteerPageLoadPolicy.LOAD);
+
+                if (getHtmlResult.result === "success") {
+                    logger.info("Successfully fetched html for url.", metadata);
+                    logger.info(getHtmlResult.value.html);
+                } else {
+                    logger.error("An error occurred while fetching html for page.", metadata);
+                }
+
+                return undefined;
+            });
+
+            await Promise.all(result);
+            await browserInstance.close();
+
+            return {
+                result: "success",
+                value: true
+            };
+        } else {
+            return getRedisPublisherResult;
+        }
+    }
+}
\ No newline at end of file
diff --git a/src/core/scrapping/scrolling_scrapper/index.ts b/src/core/scrapping/scrolling_scrapper/index.ts
new file mode 100644
index 0000000..ab5bb35
--- /dev/null
+++ b/src/core/scrapping/scrolling_scrapper/index.ts
@@ -0,0 +1,90 @@
+import { setTimeout, setInterval } from "timers/promises";
+
+import * as puppeteer from 'puppeteer';
+
+import { PuppeteerPageLoadPolicy, SimpleWebPage } from '../../../utils/types/common';
+import { Result } from '../../../utils/types/result_type';
+import { getConfig } from '../../..';
+
+const {logger} = getConfig();
+
+export async function getHtmlForPage(
+    browser: puppeteer.Browser,
+    url: string,
+    waitUntilPolicy: PuppeteerPageLoadPolicy
+): Promise<Result<SimpleWebPage, Error>> {
+    try {
+        const page1 = await browser.newPage();
+        await page1.setViewport({width: 1280, height: 720});
+        await page1.goto(url, {waitUntil: waitUntilPolicy});
+        await setTimeout(15000);
+        await getScrollContent(page1);
+        const html = await page1.content();
+        return {result: "success", value: {html, forUrl: url}};
+    } catch (e: any) {
+        const message = `An exception occurred while fetching data from scrolling page for url | ${url}`;
+        logger.error(message, e.message);
+        return {result: "error", value: new Error(e.message)};
+    }
+}
+
+/**
+ * TODO: Move the selector code to the individual provider, but keep the scroll behavior the same across the board.
+ * @param page
+ */
+async function getScrollContent(page: puppeteer.Page): Promise<boolean> {
+    logger.trace("Running scroll down function");
+    const section = await page.$('.biab_body.contentWrap'); // find the containing body of the content. In this case it's a <div>.
+    if (section !== null) {
+        logger.trace("Found section");
+
+        /**
+         * Uses a set number of scrolls to fetch new content.
+         * This method was chosen for simplicity; a more advanced method
+         * would check that the dimensions of the bounding box have stopped
+         * changing to determine that no new content is available.
+         */
+        const numScrolls = 30;
+        let counter = 1;
+        const delayBetweenScrollsMills = 2000; // give the page time to make AJAX calls for new content.
+
+        for await (const value of setInterval(delayBetweenScrollsMills, numScrolls)) { // value === numScrolls on every tick
+            if (counter > value) {
+                break; // stop scrolling for new data once numScrolls scrolls have been made
+            } else {
+                await scrollDown(page, await getBoundingBox(section));
+                counter += 1;
+            }
+        }
+        return true;
+    } else {
+        logger.trace("Failed to find section.");
+        return false;
+    }
+}
+
+/**
+ * Get the bounding box for the element to be scrolled.
+ * @param elementHandle
+ * @returns
+ */
+async function getBoundingBox(elementHandle: puppeteer.ElementHandle): Promise<puppeteer.BoundingBox> {
+    const boundingBox = await elementHandle.boundingBox();
+    if (boundingBox !== null) {
+        logger.trace(boundingBox);
+        return boundingBox;
+    } else {
+        throw new Error("Failed to find bounding box for provided element");
+    }
+}
+
+async function scrollDown(page: puppeteer.Page, boundingBox: puppeteer.BoundingBox): Promise<void> {
+    // move the mouse to the center of the element to be scrolled
+    await page.mouse.move(
+        boundingBox.x + boundingBox.width / 2,
+        boundingBox.y + boundingBox.height / 2
+    );
+
+    // use the mouse scroll wheel to scroll. Change the scroll-down delta according to your needs.
+    await page.mouse.wheel({deltaY: 300});
+}
diff --git a/src/core/scrapping/simple_scraper/index.ts b/src/core/scrapping/simple_scraper/index.ts
index e5c2252..8efbf35 100644
--- a/src/core/scrapping/simple_scraper/index.ts
+++ b/src/core/scrapping/simple_scraper/index.ts
@@ -25,12 +25,12 @@ export async function getHtmlForPage(
     try {
         const page1 = await browser.newPage();
         await page1.goto(url, {waitUntil: waitUntilPolicy});
-        await setTimeout(3000); // wait for some time before fetching content
+        await setTimeout(15000); // wait for some time before fetching content
         const html = await page1.content();
         await page1.close();
         return {result: "success", value: {html, forUrl: url}};
     } catch (e: any) {
-        const message = `An exception occurred while fetching simple web page for url | ${url}`
+        const message = `An exception occurred while fetching simple web page for url | ${url}`;
         logger.error(message, e.message);
         return {result: "error", value: new Error(e.message)};
     }
diff --git a/src/testbed/build_scrapers/orbit/three_way/index.ts b/src/testbed/build_scrapers/orbit/three_way/index.ts
new file mode 100644
index 0000000..e69de29
diff --git a/src/testbed/testbed.ts b/src/testbed/testbed.ts
index c4f60eb..569d801 100644
--- a/src/testbed/testbed.ts
+++ b/src/testbed/testbed.ts
@@ -1,8 +1,4 @@
-import { BetikaProvider } from "../bet_providers/betika";
-import { BetikaScrapper } from "../core/scrapping/betika";
+import { OrbitScrapper } from "../core/scrapping/orbit";
 
-const betikaProvider = new BetikaProvider();
-betikaProvider.getConfig();
-
-const betikaScrapper = new BetikaScrapper();
+const betikaScrapper = new OrbitScrapper();
 betikaScrapper.fetchData();
diff --git a/src/utils/types/common/index.ts b/src/utils/types/common/index.ts
index 9684b32..28a12d4 100644
--- a/src/utils/types/common/index.ts
+++ b/src/utils/types/common/index.ts
@@ -1,6 +1,7 @@
 export enum BetProviders {
     BETIKA = "BETIKA",
-    SPORTPESA = "SPORTPESA"
+    SPORTPESA = "SPORTPESA",
+    ORBIT = "ORBIT"
 }
 
 export enum Games {