Merge pull request #24 from fhocutt/master

Add integration with turnitin/plagiabot/EranBot
earwig · Jan 20, 2016 · d31e24f · d31e24f
2 parents f84a0b4 + 9a4dde1
commit d31e24f
Show file tree

Hide file tree

Showing 6 changed files with 156 additions and 1 deletion.
diff --git a/app.py b/app.py
@@ -103,7 +103,8 @@ def index():
     update_sites()
     query = do_check()
     return render_template(
-        "index.mako", notice=notice, query=query, result=query.result)
+        "index.mako", notice=notice, query=query, result=query.result,
+        turnitin_result=query.turnitin_result)
 
 @app.route("/settings", methods=["GET", "POST"])
 @catch_errors

diff --git a/copyvios/checker.py b/copyvios/checker.py
@@ -11,6 +11,7 @@
 
 from .misc import Query, get_db
 from .sites import get_site
+from .turnitin import search_turnitin
 
 __all__ = ["do_check", "T_POSSIBLE", "T_SUSPECT"]
 
@@ -63,9 +64,16 @@ def _get_results(query, follow=True):
         conn = get_db()
         use_engine = 0 if query.use_engine in ("0", "false") else 1
         use_links = 0 if query.use_links in ("0", "false") else 1
+        use_turnitin = 0 if query.turnitin in ("0", "false") else 1
         if not use_engine and not use_links:
             query.error = "no search method"
             return
+
+        # Handle the turnitin check
+        if use_turnitin:
+            query.turnitin_result = search_turnitin(query.title, query.lang)
+
+        # Handle the copyvio check
         mode = "{0}:{1}:".format(use_engine, use_links)
         if not _coerce_bool(query.nocache):
             query.result = _get_cached_results(

diff --git a/copyvios/misc.py b/copyvios/misc.py
@@ -1,5 +1,6 @@
 # -*- coding: utf-8  -*-
 
+import datetime
 from os.path import expanduser
 
 from flask import g, request
@@ -64,6 +65,9 @@ def httpsfix(context, url):
         url = url[len("http:"):]
     return url
 
+def parse_wiki_timestamp(timestamp):
+    """Convert a ``YYYYMMDDHHMMSS`` timestamp string into a datetime."""
+    wiki_timestamp_format = '%Y%m%d%H%M%S'
+    return datetime.datetime.strptime(timestamp, wiki_timestamp_format)
+
 def urlstrip(context, url):
     if url.startswith("http://"):
         url = url[7:]

diff --git a/copyvios/turnitin.py b/copyvios/turnitin.py
@@ -0,0 +1,104 @@
+# -*- coding: utf-8 -*-
+from ast import literal_eval
+import re
+
+import requests
+
+from .misc import parse_wiki_timestamp
+
+__all__ = ['search_turnitin', 'TURNITIN_API_ENDPOINT']
+
+# Plagiabot (EranBot) API endpoint. Queried over HTTPS so that the page title
+# and the response, which is later parsed into Python objects, are not sent
+# in the clear.
+TURNITIN_API_ENDPOINT = 'https://tools.wmflabs.org/eranbot/plagiabot/api.py'
+
+def search_turnitin(page_title, lang):
+    """ Look up Turnitin reports for a page in the Plagiabot database.
+
+    Keyword arguments:
+    page_title -- string containing the page title
+    lang       -- string containing the page's project language code
+
+    Return a TurnitinResult built from the API response; its ``reports``
+    list holds one TurnitinReport per item in that response.
+    """
+    api_response = _make_api_request(page_title, lang)
+    return TurnitinResult(api_response)
+
+def _make_api_request(page_title, lang, timeout=30):
+    """ Query the plagiabot API for Turnitin reports for a given page.
+
+    Keyword arguments:
+    page_title -- page title; spaces are replaced with underscores
+    lang       -- language code, sent as the ``lang`` parameter
+    timeout    -- seconds to wait for the API before giving up
+
+    Return the API response parsed into Python objects.
+
+    Raises requests.RequestException on a timeout, a connection failure
+    or a non-2xx response, and ValueError/SyntaxError if the response
+    body is not a Python literal.
+    """
+    api_parameters = {'action': 'suspected_diffs',
+                      'page_title': page_title.replace(' ', '_'),
+                      'lang': lang,
+                      'report': 1}
+
+    # requests never times out on its own; without this a stalled API
+    # would block the caller indefinitely.
+    result = requests.get(TURNITIN_API_ENDPOINT, params=api_parameters,
+                          timeout=timeout)
+    # Fail on HTTP errors rather than feeding an error page to the parser.
+    result.raise_for_status()
+    # literal_eval only accepts Python literals, so the dict-containing
+    # response string is parsed without executing anything in it.
+    return literal_eval(result.text)
+
+class TurnitinResult:
+    """ Holds every TurnitinReport found for one page. A page may have
+    no reports at all, or several when plagiarism has been detected for
+    more than one revision.
+
+    TurnitinResult.reports -- list containing >= 0 TurnitinReport items
+    """
+    def __init__(self, turnitin_data):
+        """
+        Keyword argument:
+        turnitin_data -- plagiabot API result; each item must provide the
+                         'diff_timestamp', 'diff' and 'report' keys
+        """
+        self.reports = [
+            TurnitinReport(
+                entry['diff_timestamp'], entry['diff'], entry['report'])
+            for entry in turnitin_data]
+
+    def __repr__(self):
+        return str(vars(self))
+
+class TurnitinReport:
+    """ Contains data for each Turnitin report (one on each potentially
+    plagiarized revision).
+
+    TurnitinReport.reportid  -- Turnitin report ID, taken from plagiabot;
+                                None if the report text has no ``?rid=``
+    TurnitinReport.diffid    -- diff ID from Wikipedia database
+    TurnitinReport.time_posted -- datetime of the time the diff posted
+    TurnitinReport.sources   -- list of dicts with information on:
+        percent -- percent of revision found in source as well
+        words   -- number of words found in both source and revision
+        url     -- url for the possibly-plagiarized source
+    The percent and words values are the digit strings captured from the
+    report text, not ints.
+    """
+    # Compiled once when the class is created rather than on every parse.
+    _REPORT_ID_PATTERN = re.compile(r'\?rid=(\d*)')
+    _SOURCE_PATTERN = re.compile(
+        r'\n\* \w\s+(\d*)\% (\d*) words at \[(.*?) ')
+
+    def __init__(self, timestamp, diffid, report):
+        """
+        Keyword argument:
+        timestamp  -- diff timestamp from Wikipedia database
+        diffid     -- diff ID from Wikipedia database
+        report     -- Turnitin report from the plagiabot database
+        """
+        self.report_data = self._parse_report(report)
+        self.reportid = self.report_data[0]
+        self.diffid = diffid
+        self.time_posted = parse_wiki_timestamp(timestamp)
+
+        self.sources = [
+            {'percent': percent, 'words': words, 'url': url}
+            for percent, words, url in self.report_data[1]]
+
+    def __repr__(self):
+        return str(self.__dict__)
+
+    def _parse_report(self, report_text):
+        """ Extract the report ID and the matched sources from a report.
+
+        Return a ``(report_id, sources)`` tuple. ``report_id`` is the
+        digits following ``?rid=``, or None when the text has no such
+        marker. ``sources`` is a list of ``(percent, words, url)`` string
+        tuples, one per source line found; it is empty if none match.
+        """
+        # search() returns None when there is no "?rid="; guard against it
+        # so one malformed report does not raise AttributeError.
+        id_match = self._REPORT_ID_PATTERN.search(report_text)
+        report_id = id_match.group(1) if id_match else None
+
+        # percent match, word count and URL for each source in the report
+        sources = self._SOURCE_PATTERN.findall(report_text)
+
+        return (report_id, sources)
diff --git a/static/style.css b/static/style.css
@@ -63,6 +63,17 @@ div#info-box {
     margin: 10px 5px;
 }
 
+/* Box that wraps the Turnitin results section. */
+div#turnitin-container {
+    padding: 5px 10px;
+    margin: 15px 5px 10px 5px;
+}
+
+/* Centered, bold heading inside the Turnitin box; the negative bottom
+   margin pulls the content that follows 5px closer to it. */
+div#turnitin-title {
+    margin-bottom: -5px;
+    text-align: center;
+    font-weight: bold;
+}
+
 div#cv-result {
     padding: 5px;
     margin: 10px 5px;

diff --git a/templates/index.mako b/templates/index.mako
@@ -113,6 +113,9 @@
                             <input class="cv-search" type="hidden" name="use_links" value="0" />
                             <input id="cv-cb-links" class="cv-search" type="checkbox" name="use_links" value="1" ${'checked="checked"' if (query.use_links != "0") else ""} />
                             <label for="cv-cb-links">Use&nbsp;links&nbsp;in&nbsp;page</label>
+                            ## Hidden fallback: an unchecked checkbox submits nothing, so this sends turnitin=0 in that case.
+                            <input class="cv-search" type="hidden" name="turnitin" value="0" />
+                            <span style="white-space:nowrap"><input id="cv-cb-turnitin" class="cv-search" type="checkbox" name="turnitin" value="1" ${'checked="checked"' if (query.turnitin != "0") else ""}/>
+                            <label for="cv-cb-turnitin">Use&nbsp;Turnitin&nbsp;database</label></span>
                         </td>
                     </tr>
                     <tr>
@@ -146,6 +149,7 @@
         </tr>
     </table>
 </form>
+
 % if result:
     <div id="generation-time">
         Results
@@ -160,6 +164,29 @@
         % endif
         <a href="${request.script_root | h}?lang=${query.lang | h}&amp;project=${query.project | h}&amp;oldid=${query.oldid or query.page.lastrevid | h}&amp;action=${query.action | h}&amp;${"use_engine={0}&use_links={1}".format(int(query.use_engine not in ("0", "false")), int(query.use_links not in ("0", "false"))) if query.action == "search" else "" | h}${"url=" if query.action == "compare" else ""}${query.url if query.action == "compare" else "" | u}">Permalink.</a>
     </div>
+
+    ## Gate on the result object, not on the raw query.turnitin parameter: the
+    ## string "0" is truthy, and no result is computed when the check is off.
+    ## NOTE(review): assumes turnitin_result is falsy (e.g. None) when the
+    ## Turnitin check did not run -- confirm against the Query class.
+    % if turnitin_result:
+        <div id="turnitin-container" class="${'red' if turnitin_result.reports else 'green'}-box">
+            <div id="turnitin-title">Turnitin Results</div>
+            % if turnitin_result.reports:
+                <p>Turnitin (through <a href="https://en.wikipedia.org/wiki/User:EranBot">EranBot</a>) found revisions that may have been plagiarized. Please review them.</p>
+
+                <table id="turnitin-table"><tbody>
+                ## One row per report, so the cell uses a class rather than a repeated id.
+                ## Values from the API and the query are escaped before output.
+                % for report in turnitin_result.reports:
+                    <tr><td class="turnitin-table-cell"><a href="https://tools.wmflabs.org/eranbot/ithenticate.py?rid=${report.reportid | h}">Turnitin report ${report.reportid | h}</a> for text added <a href="https://${query.lang | h}.wikipedia.org/w/index.php?title=${query.title | u}&amp;diff=${report.diffid | h}"> at ${report.time_posted | h}</a>:
+                    <ul>
+                    % for source in report.sources:
+                          <li> ${source['percent'] | h}% of revision text (${source['words'] | h} words) found at <a href="${source['url'] | h}">${source['url'] | h}</a></li>
+                    % endfor
+                    </ul></td></tr>
+                % endfor
+                </tbody></table>
+            % else:
+                <p>Turnitin (through <a href="https://en.wikipedia.org/wiki/User:EranBot">EranBot</a>) found no matching sources.</p>
+            % endif
+        </div>
+    % endif
+
     <div id="cv-result" class="${'red' if result.confidence >= T_SUSPECT else 'yellow' if result.confidence >= T_POSSIBLE else 'green'}-box">
         <table id="cv-result-head-table">
             <colgroup>