Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add integration with turnitin/plagiabot/EranBot #24

Merged
merged 8 commits into from
Jan 20, 2016
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion app.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,8 @@ def index():
update_sites()
query = do_check()
return render_template(
"index.mako", notice=notice, query=query, result=query.result)
"index.mako", notice=notice, query=query, result=query.result,
turnitin_result=query.turnitin_result)

@app.route("/settings", methods=["GET", "POST"])
@catch_errors
Expand Down
8 changes: 8 additions & 0 deletions copyvios/checker.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@

from .misc import Query, get_db
from .sites import get_site
from .turnitin import search_turnitin

__all__ = ["do_check", "T_POSSIBLE", "T_SUSPECT"]

Expand Down Expand Up @@ -63,9 +64,16 @@ def _get_results(query, follow=True):
conn = get_db()
use_engine = 0 if query.use_engine in ("0", "false") else 1
use_links = 0 if query.use_links in ("0", "false") else 1
use_turnitin = 0 if query.turnitin in ("0", "false") else 1
if not use_engine and not use_links:
query.error = "no search method"
return

# Handle the turnitin check
if use_turnitin:
query.turnitin_result = search_turnitin(query.title, query.lang)

# Handle the copyvio check
mode = "{0}:{1}:".format(use_engine, use_links)
if not _coerce_bool(query.nocache):
query.result = _get_cached_results(
Expand Down
4 changes: 4 additions & 0 deletions copyvios/misc.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# -*- coding: utf-8 -*-

import datetime
from os.path import expanduser

from flask import g, request
Expand Down Expand Up @@ -64,6 +65,9 @@ def httpsfix(context, url):
url = url[len("http:"):]
return url

def parse_wiki_timestamp(timestamp):
    """Turn a wiki timestamp string into a ``datetime.datetime``.

    Keyword argument:
    timestamp -- string in ``%Y%m%d%H%M%S`` form, e.g. '20160120153045'

    Raises ValueError (from ``strptime``) when the string does not match
    that format.
    """
    wiki_format = '%Y%m%d%H%M%S'
    parsed = datetime.datetime.strptime(timestamp, wiki_format)
    return parsed

def urlstrip(context, url):
if url.startswith("http://"):
url = url[7:]
Expand Down
104 changes: 104 additions & 0 deletions copyvios/turnitin.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
# -*- coding: utf-8 -*-
from ast import literal_eval
import re

import requests

from .misc import parse_wiki_timestamp

# Names exported by a star-import of this module.
__all__ = ['search_turnitin', 'TURNITIN_API_ENDPOINT']

# URL of the plagiabot API script that is queried for Turnitin reports.
TURNITIN_API_ENDPOINT = 'http://tools.wmflabs.org/eranbot/plagiabot/api.py'

def search_turnitin(page_title, lang):
    """ Look up a page's Turnitin reports in the Plagiabot database.

    Keyword arguments:
    page_title -- string containing the page title
    lang -- string containing the page's project language code

    Return a TurnitinResult built from the API response (it holds a list
    of TurnitinReports).
    """
    api_response = _make_api_request(page_title, lang)
    return TurnitinResult(api_response)

def _make_api_request(page_title, lang):
    """ Query the plagiabot API for Turnitin reports for a given page.

    Keyword arguments:
    page_title -- page title; spaces are replaced with underscores
    lang -- language code, sent as the ``lang`` API parameter

    Return the parsed API result. If the response body cannot be parsed as
    a Python literal (for example an empty body or an HTML error page), an
    empty list is returned so the caller sees "no reports" instead of
    crashing.

    Raises requests.exceptions.RequestException if the HTTP request itself
    fails or times out.
    """
    api_parameters = {'action': 'suspected_diffs',
                      'page_title': page_title.replace(' ', '_'),
                      'lang': lang,
                      'report': 1}

    # requests never times out by default; bound the wait so a stalled API
    # server cannot hang the caller indefinitely.
    result = requests.get(
        TURNITIN_API_ENDPOINT, params=api_parameters, timeout=30)

    # literal_eval only accepts Python literals, so the response text is
    # parsed without executing any code it might contain.
    try:
        return literal_eval(result.text)
    except (SyntaxError, ValueError, TypeError):
        return []

class TurnitinResult:
    """ Holds every TurnitinReport found for one page. A page may have
    no reports at all, or several when plagiarism has been detected for
    more than one revision.

    TurnitinResult.reports -- list containing >= 0 TurnitinReport items
    """
    def __init__(self, turnitin_data):
        """
        Keyword argument:
        turnitin_data -- plagiabot API result; each entry is indexed by
                         'diff_timestamp', 'diff' and 'report'
        """
        self.reports = [
            TurnitinReport(
                entry['diff_timestamp'], entry['diff'], entry['report'])
            for entry in turnitin_data]

    def __repr__(self):
        """ Return the string form of the instance's attribute dict. """
        return str(vars(self))

class TurnitinReport:
""" Contains data for each Turnitin report (one on each potentially
plagiarized revision).

TurnitinReport.reportid -- Turnitin report ID, taken from plagiabot
TurnitinReport.diffid -- diff ID from Wikipedia database
TurnitinReport.time_posted -- datetime of the time the diff posted
TurnitinReport.sources -- list of dicts with information on:
percent -- percent of revision found in source as well
words -- number of words found in both source and revision
url -- url for the possibly-plagiarized source
"""
def __init__(self, timestamp, diffid, report):
    """
    Keyword arguments:
    timestamp -- diff timestamp from Wikipedia database
    diffid -- diff ID from Wikipedia database
    report -- Turnitin report from the plagiabot database
    """
    parsed = self._parse_report(report)

    # Keep this assignment order: __repr__ prints the attribute dict.
    self.report_data = parsed
    self.reportid = parsed[0]
    self.diffid = diffid
    self.time_posted = parse_wiki_timestamp(timestamp)
    # Each parsed match is (percent, words, url), in that order.
    self.sources = [
        {'percent': match[0], 'words': match[1], 'url': match[2]}
        for match in parsed[1]]

def __repr__(self):
return str(self.__dict__)

def _parse_report(self, report_text):
# extract report ID
report_id_pattern = re.compile(r'\?rid=(\d*)')
report_id = report_id_pattern.search(report_text).groups()[0]

# extract percent match, words, and URL for each source in the report
extract_info_pattern = re.compile(
r'\n\* \w\s+(\d*)\% (\d*) words at \[(.*?) ')
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Are you sure you don't need to escape the backslash before the n? i.e. '\\n' instead of '\n'. Maybe Python's regex engine is smarter than PHP's :)

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Python's raw string notation is pretty awesome.

results = extract_info_pattern.findall(report_text)

return (report_id, results)
11 changes: 11 additions & 0 deletions static/style.css
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,17 @@ div#info-box {
margin: 10px 5px;
}

/* Outer box of the Turnitin results section. */
div#turnitin-container {
    margin: 15px 5px 10px;
    padding: 5px 10px;
}

/* Centered, bold heading line of the Turnitin box. */
div#turnitin-title {
    font-weight: bold;
    text-align: center;
    margin-bottom: -5px;
}

div#cv-result {
padding: 5px;
margin: 10px 5px;
Expand Down
27 changes: 27 additions & 0 deletions templates/index.mako
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,9 @@
<input class="cv-search" type="hidden" name="use_links" value="0" />
<input id="cv-cb-links" class="cv-search" type="checkbox" name="use_links" value="1" ${'checked="checked"' if (query.use_links != "0") else ""} />
<label for="cv-cb-links">Use&nbsp;links&nbsp;in&nbsp;page</label>
## The hidden fallback must share the checkbox's name so that an unchecked
## box still submits turnitin=0 (and does not post a stray use_links=0).
<input class="cv-search" type="hidden" name="turnitin" value="0" />
<span style="white-space:nowrap"><input id="cv-cb-turnitin" class="cv-search" type="checkbox" name="turnitin" value="1" ${'checked="checked"' if (query.turnitin != "0") else ""}/>
<label for="cv-cb-turnitin">Search&nbsp;Turnitin&nbsp;reports</label></span>
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This wording seems a bit awkward to me since the reports are based on the search. Maybe "Use Turnitin" or "Use Turnitin database" instead?

</td>
</tr>
<tr>
Expand Down Expand Up @@ -146,6 +149,7 @@
</tr>
</table>
</form>

% if result:
<div id="generation-time">
Results
Expand All @@ -160,6 +164,29 @@
% endif
<a href="${request.script_root | h}?lang=${query.lang | h}&amp;project=${query.project | h}&amp;oldid=${query.oldid or query.page.lastrevid | h}&amp;action=${query.action | h}&amp;${"use_engine={0}&use_links={1}".format(int(query.use_engine not in ("0", "false")), int(query.use_links not in ("0", "false"))) if query.action == "search" else "" | h}${"url=" if query.action == "compare" else ""}${query.url if query.action == "compare" else "" | u}">Permalink.</a>
</div>

## Show this section only when a Turnitin lookup produced a result object.
## query.turnitin is the raw form value, so the string "0" would be truthy
## here even though no lookup ran and there is no result to read.
% if turnitin_result:
<div id="turnitin-container" class="${'red' if turnitin_result.reports else 'green'}-box">
<div id="turnitin-title">Turnitin Results</div>
% if turnitin_result.reports:
<p>Turnitin (through <a href="https://en.wikipedia.org/wiki/User:EranBot">EranBot</a>) found revisions that may have been plagiarized. Please review them.</p>

<table id="turnitin-table"><tbody>
% for report in turnitin_result.reports:
<tr><td id="turnitin-table-cell"><a href="https://tools.wmflabs.org/eranbot/ithenticate.py?rid=${report.reportid}">Turnitin report ${report.reportid}</a> for text added <a href="https://${query.lang | h}.wikipedia.org/w/index.php?title=${query.title | u}&amp;diff=${report.diffid}"> at ${report.time_posted}</a>:
<ul>
## Source URLs come from the external report text, so escape them for HTML.
% for source in report.sources:
<li> ${source['percent']}% of revision text (${source['words']} words) found at <a href="${source['url'] | h}">${source['url'] | h}</a></li>
% endfor
</ul></td></tr>
% endfor
</tbody></table>
% else:
<p>Turnitin (through <a href="https://en.wikipedia.org/wiki/User:EranBot">EranBot</a>) found no matching sources.</p>
% endif
</div>
% endif

<div id="cv-result" class="${'red' if result.confidence >= T_SUSPECT else 'yellow' if result.confidence >= T_POSSIBLE else 'green'}-box">
<table id="cv-result-head-table">
<colgroup>
Expand Down