forked from wpoa/open-access-media-importer
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathoa-put
executable file
·131 lines (111 loc) · 4.68 KB
/
oa-put
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import csv
import importlib
from os import path
from sys import argv, stderr
from time import sleep
from urllib.parse import urlparse, urlsplit

# csv.field_size_limit must be reset according to
# <http://lethain.com/handling-very-large-csv-and-xml-files-in-python/>
csv.field_size_limit(999999999)

from helpers import config
from helpers import efetch
from helpers import filename_from_url
from helpers import mediawiki
from helpers import template
from model import session, set_source # , setup_all, create_all
from model import Article, Journal, SupplementaryMaterial
# Read the two required positional arguments: the action to perform and the
# name of the source whose media should be processed.
try:
    action = argv[1]
    target = argv[2]
except IndexError:  # fewer than two arguments given
    # Usage text is a diagnostic, so it goes to stderr. SystemExit is raised
    # directly because the `exit` builtin is only injected by the `site`
    # module and is missing under `python -S`.
    print("""
oa-put – Open Access Importer upload operations
usage: oa-put upload-media [source]
""", file=stderr)
    raise SystemExit(1)
# Reject unsupported actions with an explicit membership test. An `assert`
# would be stripped under `python -O`, letting any action through silently.
if action not in ('upload-media',):  # invalid action
    print("Unknown action “%s”.\n" % action, file=stderr)
    raise SystemExit(2)
# Resolve the source plug-in named on the command line.
#
# The name comes straight from argv, so it is imported through importlib
# instead of being spliced into an exec()'d statement: exec would run
# arbitrary code embedded in the argument and raise an uncaught SyntaxError
# for names that are not valid identifiers.
# NOTE(review): this assumes every source is a submodule of the `sources`
# package (e.g. sources/<name>.py) — confirm no source is exposed only as an
# attribute of sources/__init__.py.
try:
    if not target.isidentifier():
        raise ImportError("invalid source name: %r" % target)
    source_module = importlib.import_module("sources." + target)
except ImportError:  # invalid source
    print("Unknown source “%s”.\n" % target, file=stderr)
    raise SystemExit(3)

set_source(target)
# setup_all(True)
# Characters removed from article titles before they are used as the leading
# part of a wiki filename.
_FORBIDDEN_TITLE_CHARS = u"""?,;:^/!<>"`'±#[]|{}ʻʾʿ᾿῾‘’“”"""
_STRIP_FORBIDDEN = str.maketrans('', '', _FORBIDDEN_TITLE_CHARS)

# File extension used on the wiki for each supported media type.
_WIKI_EXTENSIONS = {'audio': 'oga', 'video': 'ogv'}


def _wiki_filename_prefix(article_title):
    """Return a hyphen-separated filename prefix derived from *article_title*.

    Newlines are dropped, runs of whitespace are collapsed, forbidden
    characters are removed, and the first hundred characters are kept with
    spaces turned into hyphens.  If the title had to be truncated, the last
    (possibly cut-off) word is removed as well.  A non-empty result always
    ends in '-'; an empty string is returned when nothing usable remains, so
    the caller can simply prepend the result.
    """
    flattened = ' '.join(article_title.replace('\n', '').split())
    cleaned = flattened.translate(_STRIP_FORBIDDEN)
    # Replacing spaces keeps the length, so the comparison below detects
    # whether the hundred-character cut actually shortened the title.
    prefix = cleaned[:100].replace(' ', '-')
    if len(cleaned) > len(prefix):
        prefix = '-'.join(prefix.split('-')[:-1])
    # Guard against an empty prefix (empty title, or a single over-long
    # word): indexing prefix[-1] would raise IndexError there.
    if prefix and not prefix.endswith('-'):
        prefix += '-'
    return prefix


if action == 'upload-media':
    media_refined_directory = config.get_media_refined_source_path(target)
    materials = session.query(SupplementaryMaterial).all()
    for material in materials:
        filename = filename_from_url(material.url) + '.ogg'
        media_refined_path = path.join(media_refined_directory, filename)

        # The query returns every material, so the converted file may not
        # exist at all.  Skip it without touching any flag instead of
        # aborting the whole batch.
        try:
            refined_size = path.getsize(media_refined_path)
        except OSError:
            print(
                "Skipping “%s”, converted file is missing or unreadable.\n"
                % media_refined_path,
                file=stderr
            )
            continue
        if refined_size == 0:
            # An empty output file means the conversion produced nothing.
            material.converted = False
            continue

        if mediawiki.is_uploaded(material):
            # Paths and names are already text; encoding them would print a
            # b'...' representation.
            print(
                "Skipping “%s”, already exists at %s.\n"
                % (media_refined_path, mediawiki.get_wiki_name())
            )
            material.uploaded = True
            continue

        # Validate the media type before any network request is made.  An
        # explicit lookup replaces `assert`, which is stripped under
        # `python -O` and would leave `extension` undefined or stale from the
        # previous iteration.
        mimetype = material.mimetype
        try:
            extension = _WIKI_EXTENSIONS[mimetype]
        except KeyError:
            raise ValueError(
                "Unsupported mimetype “%s” for <%s>." % (mimetype, material.url)
            ) from None

        article_doi = material.article.doi
        article_pmid = efetch.get_pmid_from_doi(article_doi)
        article_pmcid = efetch.get_pmcid_from_doi(article_doi)
        authors = material.article.contrib_authors
        article_title = material.article.title
        journal_title = material.article.journal.title
        article_year = material.article.year
        article_month = material.article.month
        article_day = material.article.day
        article_url = material.article.url
        license_url = material.article.license_url
        # NOTE(review): rights_holder is read but not passed to
        # template.page below — confirm whether that is intended.
        rights_holder = material.article.copyright_holder
        label = material.label
        title = material.title
        caption = material.caption
        material_url = material.url
        categories = [category.name for category in material.article.categories]
        if article_pmid is not None:
            categories += efetch.get_categories_from_pmid(article_pmid)

        # TODO: file extension should be adapted for other file formats
        # `urlparse` is imported as a function here, so the original
        # `urlparse.urlsplit(...)` raised AttributeError.
        url_path = urlsplit(material.url).path
        source_filename = url_path.split('/')[-1]
        wiki_filename = path.splitext(source_filename)[0] + '.' + extension
        if article_title is not None:
            wiki_filename = _wiki_filename_prefix(article_title) + wiki_filename

        page_template = template.page(article_doi, article_pmid, \
            article_pmcid, authors, article_title, journal_title, \
            article_year, article_month, article_day, article_url, \
            license_url, label, caption, title, categories, mimetype, \
            material_url)
        mediawiki.upload(media_refined_path, wiki_filename, page_template)
        print("“%s” uploaded to <%s>.\n" % (
            media_refined_path,
            config.api_url
        ))
        material.uploaded = True
        session.commit()
        sleep(10)  # 6 uploads per minute

    # Persist flags set on skipped materials; without this they are lost
    # whenever no later upload triggers a commit.
    session.commit()