Merge pull request #60 from knguy22/master
Update dataset for 2024
shaido987 authored Jul 12, 2024
2 parents 00d942c + 844e159 commit 8d325ef
Showing 6 changed files with 21,879 additions and 13,622 deletions.
6 changes: 3 additions & 3 deletions README.md
@@ -7,10 +7,10 @@
---

Creates a dataset from novelupdates (https://www.novelupdates.com) containing information about translated novels.
- The dataset contains translated English novels from eight different original languages (chinese, japanese, korean, malaysian, filipino, indonesian, khmer, thai). There is currently a total of **13,592** novels.
+ The dataset contains translated English novels from eight different original languages (chinese, japanese, korean, malaysian, filipino, indonesian, khmer, thai). There is currently a total of **21,831** novels.

- Current Version: 0.1.3
- Updated on 2022-10-18
+ Current Version: 0.1.4
+ Updated on 2024-07-10


Dataset columns:
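For anyone picking up the updated dataset, a minimal loading sketch (not part of the commit; it assumes the renamed CSV from this diff, `novels_0.1.4.csv`, is in the working directory, and the full column list is truncated in this view):

```python
# A minimal sketch, not part of the commit: load the updated dataset.
import pandas as pd

novels = pd.read_csv("novels_0.1.4.csv")
print(len(novels))              # 21,831 rows per the updated README
print(novels.columns.tolist())  # the dataset columns listed above
```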
24 changes: 12 additions & 12 deletions create_graph.ipynb

Large diffs are not rendered by default.

Binary file modified graph.png
35,425 changes: 21,832 additions & 13,593 deletions novels_0.1.3.csv → novels_0.1.4.csv

Large diffs are not rendered by default.

Binary file modified requirements.txt
Binary file not shown.
46 changes: 32 additions & 14 deletions scraper.py
@@ -1,20 +1,21 @@
import re
- import cfscrape
+ import cloudscraper
import argparse
import pandas as pd
import numpy as np
from time import sleep
from bs4 import BeautifulSoup
from utils import get_value, str2bool, get_value_str_txt, is_empty, progressbar


class NovelScraper:
"""
Scrapes novel information from novelupdates, http://www.novelupdates.com/.
Constant web links:
-    * NOVEL_LIST_URL: http://www.novelupdates.com/novelslisting/?st=1&pg=
-        The URL of the series listings. A number is added to the end depending on the wanted tab.
+    * NOVEL_LIST_URL: https://www.novelupdates.com/series-ranking/?rank=sixmonths&pg=
+        The URL of the series rankings. A number is added to the end depending on the wanted tab.
+        The series listings were used previously, but novelupdates changed its layout so that only 100 pages
+        of ids are available there; the rankings are now the only way to reach all novels, even though a
+        constant ordering is not guaranteed.
* NOVEL_SINGLE_URL: http://www.novelupdates.com/?p=
The URL of a single novel, an ID number needs to be added to the end for the specific novel.
@@ -26,22 +27,36 @@ class NovelScraper:
def __init__(self, delay=1.0, debug=False):
self.delay = delay
self.debug = debug
self.NOVEL_LIST_URL = "http://www.novelupdates.com/novelslisting/?st=1&pg="
self.NOVEL_LIST_URL = "https://www.novelupdates.com/series-ranking/?rank=sixmonths&pg="
self.NOVEL_SINGLE_URL = "http://www.novelupdates.com/?p="
- self.scraper = cfscrape.create_scraper(delay=10)
+ self.scraper = cloudscraper.create_scraper(
+     interpreter="nodejs",
+     delay=10,
+     browser={
+         "browser": "chrome",
+         "platform": "ios",
+         "desktop": False,
+     },
+ )

def parse_all_novels(self):
"""
Parses and scrapes information from all novel pages.
:returns: A list of dictionaries with all scraped and cleaned information of the novels.
"""
novel_ids = self.get_all_novel_ids()
print("Found {} unique novels to parse.".format(len(novel_ids)))

all_novel_information = []
for novel_id in progressbar(novel_ids, prefix="Parsing novels: ", suffix="current novel id: "):
-     info = self.parse_single_novel(novel_id)
-     all_novel_information.append(info)
-     sleep(self.delay)
+     try:
+         info = self.parse_single_novel(novel_id)
+         all_novel_information.append(info)
+         sleep(self.delay)
+     except Exception as e:
+         # give time just in case for novelupdates to recover
+         sleep(self.delay * 10)
+ print(f"Successfully parsed {len(all_novel_information)} novels.")
return all_novel_information

def parse_single_novel(self, novel_id):
@@ -80,16 +95,19 @@ def get_all_novel_ids(self):
novels_num_pages = 1
print('Debug run, using 1 page with novels.')
else:
print('Full run, obtaining number of pages with novels...')
page = self.scraper.get(self.NOVEL_LIST_URL + '1')
novels_num_pages = self.get_novel_list_num_pages(page)
print('Full run, pages with novels:', novels_num_pages)
sleep(self.delay)

- all_novel_ids = []
+ print('Estimated number of novels: {}'.format(novels_num_pages * 25))
+ all_novel_ids = set()
page_nums = progressbar(range(1, novels_num_pages + 1), prefix="Obtaining novel ids: ", suffix="current page: ")
for page_num in page_nums:
page = self.scraper.get(self.NOVEL_LIST_URL + str(page_num))
novel_ids = self.get_novel_ids(page)
-     all_novel_ids.extend(novel_ids)
+     all_novel_ids.update(novel_ids)
sleep(self.delay)
return all_novel_ids

@@ -120,7 +138,7 @@ def get_novel_ids(page):
table = soup.find('div', attrs={'class': 'w-blog-content other'})
novels = table.find_all('div', attrs={'class': 'search_title'})
novel_ids = [novel.find('span', attrs={'class': 'rl_icons_en'}).get('id')[3:] for novel in novels]
- novel_ids = [int(n) for n in novel_ids]
+ novel_ids = {int(n) for n in novel_ids}
return novel_ids

@staticmethod
@@ -200,7 +218,7 @@ def chapter_info(content):
table = content.find('table', attrs={'id': 'myTable'})
if table is not None:
release_table = table.find('tbody')
- chap_info['chapter_latest_translated'] = release_table.find('tr').find_all('td')[2].a.string.strip()
+ chap_info['chapter_latest_translated'] = release_table.find('tr').find_all('td')[2].span.string.strip()
return chap_info

@staticmethod
@@ -279,7 +297,7 @@ def relation_info(content):
parser.add_argument('--debug', type=str2bool, nargs='?', const=True, default=False)
parser.add_argument('--delay', type=float, default=0.5)
parser.add_argument('--novel_id', type=int, default=-1)
- parser.add_argument('--version_number', type=str, default='0.1.3')
+ parser.add_argument('--version_number', type=str, default='0.1.4')
args = parser.parse_args()

novel_scraper = NovelScraper(args.delay, args.debug)
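A minimal usage sketch for the updated scraper (not part of the commit; it assumes `scraper.py` from this diff is importable, and per the constructor and `if debug` branch shown above, `debug=True` limits the run to a single ranking page):

```python
# A minimal sketch, not part of the commit: drive the updated scraper.
from scraper import NovelScraper

scraper = NovelScraper(delay=0.5, debug=True)  # debug=True fetches only one ranking page
novels = scraper.parse_all_novels()            # list of dicts, one per parsed novel
print(f"Scraped {len(novels)} novels")
```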
