Merge pull request #538 from kabalin/urivalidator

Improve URL parsing in input strings
PastVu · Dec 22, 2022 · 95d046f · 95d046f
2 parents 7453bfa + 16bc0ce
commit 95d046f
Show file tree

Hide file tree

Showing 3 changed files with 141 additions and 137 deletions.
diff --git a/commons/Utils.js b/commons/Utils.js
@@ -11,7 +11,6 @@ const _ = require('lodash');
 const _s = require('underscore.string');
 const useragent = require('useragent');
 const DMP = require('diff-match-patch');
-const regexpUrl = require('./regex-weburl');
 const turf = require('@turf/turf');
 const ms = require('ms');
 
@@ -130,7 +129,22 @@ Utils.isType = function (type, obj) {
 Utils.validateEmail = function (email) {
     const emailRegexp = /^(([^<>()[\]\\.,;:\s@"]+(\.[^<>()[\]\\.,;:\s@"]+)*)|(".+"))@(([[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}])|(([a-zA-Z\-0-9]+\.)+[a-zA-Z]{2,}))$/;
 
-    return email.toLowerCase().match(emailRegexp);
+    return !!email.toLowerCase().match(emailRegexp);
+};
+
+/**
+ * Checks if a string is a well-formed web URL (http, https, ftp or protocol-relative "//host"); other URI schemes are rejected.
+ *
+ * We use Diego Perini's validator retrieved from https://gist.github.com/dperini/729294,
+ * MIT licensed, version 2018/09/12, see http://mathiasbynens.be/demo/url-regex for details.
+ *
+ * @param {string} uri
+ * @returns {boolean}
+ */
+Utils.validateURI = function (uri) {
+    const uriRegexp = /^(?:(?:(?:https?|ftp):)?\/\/)(?:\S+(?::\S*)?@)?(?:(?!(?:10|127)(?:\.\d{1,3}){3})(?!(?:169\.254|192\.168)(?:\.\d{1,3}){2})(?!172\.(?:1[6-9]|2\d|3[0-1])(?:\.\d{1,3}){2})(?:[1-9]\d?|1\d\d|2[01]\d|22[0-3])(?:\.(?:1?\d{1,2}|2[0-4]\d|25[0-5])){2}(?:\.(?:[1-9]\d?|1\d\d|2[0-4]\d|25[0-4]))|(?:(?:[a-z0-9\u00a1-\uffff][a-z0-9\u00a1-\uffff_-]{0,62})?[a-z0-9\u00a1-\uffff]\.)+(?:[a-z\u00a1-\uffff]{2,}\.?))(?::\d{2,5})?(?:[/?#]\S*)?$/i;
+
+    return !!uri.match(uriRegexp);
 };
 
 /**
@@ -282,49 +296,47 @@ Utils.reflectKeys = function (obj) {
 Utils.linkifyUrlString = function (text, target, className) {
     'use strict';
 
-    let matches;
-
     target = target ? ` target="${target}"` : '';
     className = className ? ` class="${className}"` : '';
 
-    //Используем match и вручную перебираем все совпадающие ссылки, чтобы декодировать их с decodeURI,
-    //на случай, если ссылка, содержащая не аски символы, вставлена из строки браузера, вида http://ru.wikipedia.org/wiki/%D0%A1%D0%B5%D0%BA%D1%81
-    //Массив совпадений делаем уникальными (uniq)
-
-    //Starting with http://, https://, or ftp://
-    matches = _.uniq(text.match(regexpUrl));
+    const replaceLink = function (match, url, punctuation) {
+        const append = punctuation || '';
+        let linkText = url;
 
-    for (let i = 0; i < matches.length; i++) {
-        try { // Do nothing if URI malformed (decodeURI fails)
-            const url = decodeURI(matches[i]);
-
-            text = text.replace(matches[i], `<a href="${url}" rel="nofollow noopener"${target}${className}>${url}</a>`);
-        } catch (err) {}
-    }
-
-    //Starting with "www." (without // before it, or it'd re-link the ones done above).
-    const matchPattern = /(^|[^/])(www\.[\S]+(\b|$))/gim;
+        if (/^www\./i.test(url)) {
+            url = url.replace(/^www\./i, 'http://www.');
+        }
 
-    matches = _.uniq(text.match(matchPattern));
+        if (!Utils.validateURI(url)) {
+            // Invalid URL, return original string.
+            return match;
+        }
 
-    for (let i = 0; i < matches.length; i++) {
         try {
-            matches[i] = _s.trim(matches[i]); //Так как в результат match попадут и переносы и пробелы (^|[^\/]), то надо их удалить
-
-            const url = decodeURI(matches[i]);
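+            // Decode URI, e.g. to make http://ru.wikipedia.org/wiki/%D0%A1%D0%B5%D0%BA%D1%81 url readable. NOTE(review): decodeURI also turns %22, %3C, %3E and %20 into raw quote, angle bracket and space characters, which are interpolated into href below without escaping - confirm the result is sanitized.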
+            url = decodeURI(url);
+            linkText = decodeURI(linkText);
+
+            return `<a href="${url}" rel="nofollow noopener"${target}${className}>${linkText}</a>${append}`;
+        } catch (err) {
+            // Malformed URI sequence, return original string.
+            return match;
+        }
+    };
 
-            text = text.replace(matches[i], `<a href="http://${url}" rel="nofollow noopener"${target}${className}>${url}</a>`);
-        } catch (err) {}
-    }
+    // Capture url starting with http://, https://, ftp:// or www., and keep one optional
+    // trailing punctuation character ([.,;!?)]) in a separate group, so we can append it after the link.
+    const simpleURLRegex = /\b((?:(?:https?|ftp):\/\/|www\.)[^'">\s]+\.[^'">\s]+?)([.,;!?)]?)(?=\s|$)/gmi;
 
-    return text;
+    return text.replace(simpleURLRegex, replaceLink);
 };
 
 Utils.inputIncomingParse = (function () {
     'use strict';
 
     const host = config.client.host;
     const reversedEscapeChars = { '<': 'lt', '>': 'gt', '"': 'quot', '&': 'amp', "'": '#39' };
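+    const trailingChars = '\s).,;>!?'; // eslint-disable-line no-useless-escape -- FIXME: in a plain string literal '\s' is just the letter s, so the [${trailingChars}] classes built from this match "s" instead of whitespace; '\\s' is needed to match whitespace as the previous inline patterns did.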
 
     function escape(txt) {
         //Паттерн из _s.escapeHTML(result); исключая амперсант
@@ -338,15 +350,17 @@ Utils.inputIncomingParse = (function () {
 
         //Заменяем ссылку на фото на диез-ссылку #xxx
         //Например, http://domain.com/p/123456 -> #123456
-        result = result.replace(new RegExp(`(\\b)(?:https?://)?(?:www.)?${host}/p/(\\d{1,8})/?(?=[\\s\\)\\.,;>]|$)`, 'gi'), '$1#$2');
+        result = result.replace(new RegExp(`(\\b)(?:https?://)?(?:www.)?${host}/p/(\\d{1,8})/?(?=[${trailingChars}]|$)`, 'gi'), '$1#$2');
+        // The same for a bare photo path: /p/123456 -> #123456
+        result = result.replace(new RegExp(`(^|\\s|\\()/p/(\\d{1,8})/?(?=[${trailingChars}]|$)`, 'gi'), '$1#$2');
 
         //Все внутрипортальные ссылки оставляем без доменного имени, от корня
         //Например, http://domain.com/u/klimashkin/photo -> /u/klimashkin/photo
         result = result.replace(new RegExp(`(\\b)(?:https?://)?(?:www.)?${host}(/[-A-Z0-9+&@#\\/%?=~_|!:,.;]*[-A-Z0-9+&@#\\/%=~_|])`, 'gim'), '$1$2');
 
         // Replace links to protected/covered photos with regular link
         // For example, /_pr/a/b/c/abc.jpg -> /_p/a/b/c/abc.jpg
-        result = result.replace(/\/_prn?\/([/a-z0-9]{26,40}\.(?:jpe?g|png))/gi, '/_p/$1');
+        result = result.replace(/\/_prn?\/([/a-z0-9]+\.(?:jpe?g|png))/gi, '/_p/$1');
 
         const plain = result;
 
@@ -359,7 +373,7 @@ Utils.inputIncomingParse = (function () {
 
         //Заменяем диез-ссылку фото #xxx на линк
         //Например, #123456 -> <a target="_blank" class="sharpPhoto" href="/p/123456">#123456</a>
-        result = result.replace(/(^|\s|\()#(\d{1,8})(?=[\s).,]|$)/g, '$1<a target="_blank" class="sharpPhoto" href="/p/$2">#$2</a>');
+        result = result.replace(new RegExp(`(^|\\s|\\()#(\\d{1,8})(?=[${trailingChars}]|$)`, 'g'), '$1<a target="_blank" class="sharpPhoto" href="/p/$2">#$2</a>');
 
         result = Utils.linkifyUrlString(result, '_blank'); //Оборачиваем остальные url в ahref
         result = result.replace(/\n{3,}/g, '<br><br>').replace(/\n/g, '<br>'); //Заменяем переносы на <br>

diff --git a/commons/__tests__/Utils.test.js b/commons/__tests__/Utils.test.js
@@ -0,0 +1,95 @@
+/**
+ * Copyright: The PastVu contributors.
+ * GNU Affero General Public License v3.0+ (see COPYING or https://www.gnu.org/licenses/agpl.txt)
+ */
+
+import Utils from '../Utils';
+import config from '../../config';
+
+const origin = config.client.origin;
+
+/**
+ * Assert that Utils.inputIncomingParse(testString).result strictly equals expectedString (desc is only used by it.each for the test title).
+ */
+const testInputIncomingParse = (desc, testString, expectedString) => {
+    expect.assertions(1);
+    expect(Utils.inputIncomingParse(testString).result).toStrictEqual(expectedString);
+};
+
+describe('utils', () => {
+    describe('incoming input parsing', () => {
+        describe('should strip spaces and replace next line', () => {
+            const testData = [
+                ['string with spaces', ' String with spaces in the middle   and at both ends  ', 'String with spaces in the middle and at both ends'],
+                ['multiline string 1', `line
+                    another line`, 'line<br> another line'],
+                ['multiline string 2', `line
+
+                    another line`, 'line<br><br> another line'],
+                ['multiline string 3', `line
+
+
+                    another line`, 'line<br><br> another line'],
+            ];
+
+            it.each(testData)('%s', testInputIncomingParse); // eslint-disable-line jest/expect-expect
+        });
+
+        describe('should replace internal links', () => {
+            const testData = [
+                ['replace photo url', `${origin}/p/123456`, '<a target="_blank" class="sharpPhoto" href="/p/123456">#123456</a>'],
+                ['replace photo path', '/p/123456', '<a target="_blank" class="sharpPhoto" href="/p/123456">#123456</a>'],
+                ['replace photo hash', '#123456', '<a target="_blank" class="sharpPhoto" href="/p/123456">#123456</a>'],
+                ['replace encoded url', 'https://ru.wikipedia.org/wiki/%D0%A4%D0%BE%D1%82%D0%BE%D0%B3%D1%80%D0%B0%D1%84%D0%B8%D1%8F', '<a href="https://ru.wikipedia.org/wiki/Фотография" rel="nofollow noopener" target="_blank">https://ru.wikipedia.org/wiki/Фотография</a>'],
+                ['shorten internal url', `${origin}/u/klimashkin/photo`, '<a target="_blank" class="innerLink" href="/u/klimashkin/photo">/u/klimashkin/photo</a>'],
+                ['replace internal path', '/u/klimashkin/photo', '<a target="_blank" class="innerLink" href="/u/klimashkin/photo">/u/klimashkin/photo</a>'],
+                ['replace protected photo url', `${origin}/_pr/a/b/c/abc.jpg`, '<a target="_blank" class="innerLink" href="/_p/a/b/c/abc.jpg">/_p/a/b/c/abc.jpg</a>'],
+                ['replace protected photo url 1', `${origin}/_prn/a/b/c/abc.png`, '<a target="_blank" class="innerLink" href="/_p/a/b/c/abc.png">/_p/a/b/c/abc.png</a>'],
+            ];
+
+            it.each(testData)('%s', testInputIncomingParse); // eslint-disable-line jest/expect-expect
+        });
+
+        describe('should respect heading and trailing punctuation for internal links', () => {
+            const testData = [
+                ['photo url', `(${origin}/p/123456) #123456.`, '(<a target="_blank" class="sharpPhoto" href="/p/123456">#123456</a>) <a target="_blank" class="sharpPhoto" href="/p/123456">#123456</a>.'],
+                ['internal url', `${origin}/u/klimashkin/photo, ${origin}/u/klimashkin/photo; (/u/klimashkin/photo)`, '<a target="_blank" class="innerLink" href="/u/klimashkin/photo">/u/klimashkin/photo</a>, <a target="_blank" class="innerLink" href="/u/klimashkin/photo">/u/klimashkin/photo</a>; (<a target="_blank" class="innerLink" href="/u/klimashkin/photo">/u/klimashkin/photo</a>)'],
+            ];
+
+            it.each(testData)('%s', testInputIncomingParse); // eslint-disable-line jest/expect-expect
+        });
+
+        describe('should replace external links', () => {
+            const testData = [
+                ['replace url', 'https://jestjs.io/docs/expect#expectassertionsnumber', '<a href="https://jestjs.io/docs/expect#expectassertionsnumber" rel="nofollow noopener" target="_blank">https://jestjs.io/docs/expect#expectassertionsnumber</a>'],
+                ['replace www url', 'www.moodle.org', '<a href="http://www.moodle.org" rel="nofollow noopener" target="_blank">www.moodle.org</a>'],
+                ['replace url with params', 'https://jestjs.io/docs/expect?show=all', '<a href="https://jestjs.io/docs/expect?show=all" rel="nofollow noopener" target="_blank">https://jestjs.io/docs/expect?show=all</a>'],
+            ];
+
+            it.each(testData)('%s', testInputIncomingParse); // eslint-disable-line jest/expect-expect
+        });
+
+        describe('should replace external links with punctuation', () => {
+            const testData = [
+                ['replace url', 'Please check https://jestjs.io/docs/expect. This is important.', 'Please check <a href="https://jestjs.io/docs/expect" rel="nofollow noopener" target="_blank">https://jestjs.io/docs/expect</a>. This is important.'],
+                ['replace urls multiline', `Check www.github.com,
+                    and also http://docs.pastvu.com;`, 'Check <a href="http://www.github.com" rel="nofollow noopener" target="_blank">www.github.com</a>,<br> and also <a href="http://docs.pastvu.com" rel="nofollow noopener" target="_blank">http://docs.pastvu.com</a>;'],
+                ['replace identical urls', 'Please check https://jestjs.io/docs/expect,  https://jestjs.io/docs/expect.', 'Please check <a href="https://jestjs.io/docs/expect" rel="nofollow noopener" target="_blank">https://jestjs.io/docs/expect</a>, <a href="https://jestjs.io/docs/expect" rel="nofollow noopener" target="_blank">https://jestjs.io/docs/expect</a>.'],
+            ];
+
+            it.each(testData)('%s', testInputIncomingParse); // eslint-disable-line jest/expect-expect
+        });
+
+        it('should replace links in complex example', () => {
+            expect.assertions(1);
+
+            const testString = `Hello /u/testuser, this photo #123456 (also #123457, #456789)
+                are related and taken from the http://oldtown.com.
+                Please amend the sources. You can find more information on https://docs.pastvu.com; https://docs.pastvu.com?id=3.`;
+
+            const expectedString = 'Hello <a target="_blank" class="innerLink" href="/u/testuser">/u/testuser</a>, this photo #123456 (also <a target="_blank" class="sharpPhoto" href="/p/123457">#123457</a>, <a target="_blank" class="sharpPhoto" href="/p/456789">#456789</a>)<br> are related and taken from the <a href="http://oldtown.com" rel="nofollow noopener" target="_blank">http://oldtown.com</a>.<br> Please amend the sources. You can find more information on <a href="https://docs.pastvu.com" rel="nofollow noopener" target="_blank">https://docs.pastvu.com</a>; <a href="https://docs.pastvu.com?id=3" rel="nofollow noopener" target="_blank">https://docs.pastvu.com?id=3</a>.';
+
+            testInputIncomingParse('', testString, expectedString);
+        });
+    });
+});
diff --git a/commons/regex-weburl.js b/commons/regex-weburl.js