Merge pull request #60 from knguy22/master
Update dataset for 2024
shaido987 authored Jul 12, 2024
2 parents 00d942c + 844e159 commit 8d325ef
Showing 6 changed files with 21,879 additions and 13,622 deletions.
6 changes: 3 additions & 3 deletions README.md
@@ -7,10 +7,10 @@
---

Creates a dataset from novelupdates (https://www.novelupdates.com) containing information about translated novels.
- The dataset contains translated English novels from eight different original languages (chinese, japanese, korean, malaysian, filipino, indonesian, khmer, thai). There is currently a total of **13,592** novels.
+ The dataset contains translated English novels from eight different original languages (chinese, japanese, korean, malaysian, filipino, indonesian, khmer, thai). There is currently a total of **21,831** novels.

- Current Version: 0.1.3
- Updated on 2022-10-18
+ Current Version: 0.1.4
+ Updated on 2024-07-10


Dataset columns:
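For anyone picking up the updated dataset, a minimal loading sketch (not part of the commit; it assumes the renamed CSV from this diff, `novels_0.1.4.csv`, is in the working directory, and the full column list is truncated in this view):

```python
# A minimal sketch, not part of the commit: load the updated dataset.
import pandas as pd

novels = pd.read_csv("novels_0.1.4.csv")
print(len(novels))              # 21,831 rows per the updated README
print(novels.columns.tolist())  # the dataset columns listed above
```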
24 changes: 12 additions & 12 deletions create_graph.ipynb

Large diffs are not rendered by default.

Binary file modified graph.png
35,425 changes: 21,832 additions & 13,593 deletions novels_0.1.3.csv → novels_0.1.4.csv

Large diffs are not rendered by default.

Binary file modified requirements.txt
Binary file not shown.
46 changes: 32 additions & 14 deletions scraper.py
@@ -1,20 +1,21 @@
import re
- import cfscrape
+ import cloudscraper
import argparse
import pandas as pd
import numpy as np
from time import sleep
from bs4 import BeautifulSoup
from utils import get_value, str2bool, get_value_str_txt, is_empty, progressbar


class NovelScraper:
"""
Scrapes novel information from novelupdates, http://www.novelupdates.com/.
Constant web links:
-    * NOVEL_LIST_URL: http://www.novelupdates.com/novelslisting/?st=1&pg=
-        The URL of the series listings. A number is added to the end depending on the wanted tab.
+    * NOVEL_LIST_URL: https://www.novelupdates.com/series-ranking/?rank=sixmonths&pg=
+        The URL of the series rankings. A number is added to the end depending on the wanted tab.
+        The series listings were used previously, but novelupdates changed its layout so that only 100 pages
+        of ids are available there; the rankings are now the only way to reach all novels, even though a
+        constant ordering is not guaranteed.
* NOVEL_SINGLE_URL: http://www.novelupdates.com/?p=
The URL of a single novel, an ID number needs to be added to the end for the specific novel.
@@ -26,22 +27,36 @@ class NovelScraper:
def __init__(self, delay=1.0, debug=False):
self.delay = delay
self.debug = debug
self.NOVEL_LIST_URL = "http://www.novelupdates.com/novelslisting/?st=1&pg="
self.NOVEL_LIST_URL = "https://www.novelupdates.com/series-ranking/?rank=sixmonths&pg="
self.NOVEL_SINGLE_URL = "http://www.novelupdates.com/?p="
- self.scraper = cfscrape.create_scraper(delay=10)
+ self.scraper = cloudscraper.create_scraper(
+     interpreter="nodejs",
+     delay=10,
+     browser={
+         "browser": "chrome",
+         "platform": "ios",
+         "desktop": False,
+     },
+ )

def parse_all_novels(self):
"""
Parses and scrapes information from all novel pages.
:returns: A list of dictionaries with all scraped and cleaned information of the novels.
"""
novel_ids = self.get_all_novel_ids()
print("Found {} unique novels to parse.".format(len(novel_ids)))

all_novel_information = []
for novel_id in progressbar(novel_ids, prefix="Parsing novels: ", suffix="current novel id: "):
-     info = self.parse_single_novel(novel_id)
-     all_novel_information.append(info)
-     sleep(self.delay)
+     try:
+         info = self.parse_single_novel(novel_id)
+         all_novel_information.append(info)
+         sleep(self.delay)
+     except Exception as e:
+         # give time just in case for novelupdates to recover
+         sleep(self.delay * 10)
+ print(f"Successfully parsed {len(all_novel_information)} novels.")
return all_novel_information

def parse_single_novel(self, novel_id):
@@ -80,16 +95,19 @@ def get_all_novel_ids(self):
novels_num_pages = 1
print('Debug run, using 1 page with novels.')
else:
print('Full run, obtaining number of pages with novels...')
page = self.scraper.get(self.NOVEL_LIST_URL + '1')
novels_num_pages = self.get_novel_list_num_pages(page)
print('Full run, pages with novels:', novels_num_pages)
sleep(self.delay)

- all_novel_ids = []
+ print('Estimated number of novels: {}'.format(novels_num_pages * 25))
+ all_novel_ids = set()
page_nums = progressbar(range(1, novels_num_pages + 1), prefix="Obtaining novel ids: ", suffix="current page: ")
for page_num in page_nums:
page = self.scraper.get(self.NOVEL_LIST_URL + str(page_num))
novel_ids = self.get_novel_ids(page)
-     all_novel_ids.extend(novel_ids)
+     all_novel_ids.update(novel_ids)
sleep(self.delay)
return all_novel_ids

@@ -120,7 +138,7 @@ def get_novel_ids(page):
table = soup.find('div', attrs={'class': 'w-blog-content other'})
novels = table.find_all('div', attrs={'class': 'search_title'})
novel_ids = [novel.find('span', attrs={'class': 'rl_icons_en'}).get('id')[3:] for novel in novels]
- novel_ids = [int(n) for n in novel_ids]
+ novel_ids = {int(n) for n in novel_ids}
return novel_ids

@staticmethod
@@ -200,7 +218,7 @@ def chapter_info(content):
table = content.find('table', attrs={'id': 'myTable'})
if table is not None:
release_table = table.find('tbody')
- chap_info['chapter_latest_translated'] = release_table.find('tr').find_all('td')[2].a.string.strip()
+ chap_info['chapter_latest_translated'] = release_table.find('tr').find_all('td')[2].span.string.strip()
return chap_info

@staticmethod
@@ -279,7 +297,7 @@ def relation_info(content):
parser.add_argument('--debug', type=str2bool, nargs='?', const=True, default=False)
parser.add_argument('--delay', type=float, default=0.5)
parser.add_argument('--novel_id', type=int, default=-1)
- parser.add_argument('--version_number', type=str, default='0.1.3')
+ parser.add_argument('--version_number', type=str, default='0.1.4')
args = parser.parse_args()

novel_scraper = NovelScraper(args.delay, args.debug)
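A minimal usage sketch for the updated scraper (not part of the commit; it assumes `scraper.py` from this diff is importable, and per the constructor and `if debug` branch shown above, `debug=True` limits the run to a single ranking page):

```python
# A minimal sketch, not part of the commit: drive the updated scraper.
from scraper import NovelScraper

scraper = NovelScraper(delay=0.5, debug=True)  # debug=True fetches only one ranking page
novels = scraper.parse_all_novels()            # list of dicts, one per parsed novel
print(f"Scraped {len(novels)} novels")
```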
