Skip to content

Commit

Permalink
Merge pull request #538 from kabalin/urivalidator
Browse files Browse the repository at this point in the history
Improve URL parsing in input strings
  • Loading branch information
kabalin authored Dec 22, 2022
2 parents 7453bfa + 16bc0ce commit 95d046f
Show file tree
Hide file tree
Showing 3 changed files with 141 additions and 137 deletions.
78 changes: 46 additions & 32 deletions commons/Utils.js
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@ const _ = require('lodash');
const _s = require('underscore.string');
const useragent = require('useragent');
const DMP = require('diff-match-patch');
const regexpUrl = require('./regex-weburl');
const turf = require('@turf/turf');
const ms = require('ms');

Expand Down Expand Up @@ -130,7 +129,22 @@ Utils.isType = function (type, obj) {
/**
 * Checks whether a value looks like a valid e-mail address.
 *
 * The address is lower-cased before it is matched against the pattern.
 *
 * @param {string} email Address to check.
 * @returns {boolean} True when the address matches the pattern, false otherwise
 *     (including when `email` is not a string).
 */
Utils.validateEmail = function (email) {
    const emailRegexp = /^(([^<>()[\]\\.,;:\s@"]+(\.[^<>()[\]\\.,;:\s@"]+)*)|(".+"))@(([[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}])|(([a-zA-Z\-0-9]+\.)+[a-zA-Z]{2,}))$/;

    // Anything that is not a string cannot be a valid address; answer false
    // instead of throwing on the missing toLowerCase().
    if (typeof email !== 'string') {
        return false;
    }

    // Single boolean result: callers get true/false, never the match array.
    return emailRegexp.test(email.toLowerCase());
};

/**
 * Tells whether the given string is a well-formed web URI.
 *
 * The pattern is Diego Perini's URL validator (MIT licensed, version 2018/09/12),
 * taken from https://gist.github.com/dperini/729294; see
 * http://mathiasbynens.be/demo/url-regex for a comparison of URL regexes.
 *
 * @param {string} uri String to test.
 * @returns {boolean} True when the whole string matches the pattern.
 */
Utils.validateURI = function (uri) {
    const uriPattern = /^(?:(?:(?:https?|ftp):)?\/\/)(?:\S+(?::\S*)?@)?(?:(?!(?:10|127)(?:\.\d{1,3}){3})(?!(?:169\.254|192\.168)(?:\.\d{1,3}){2})(?!172\.(?:1[6-9]|2\d|3[0-1])(?:\.\d{1,3}){2})(?:[1-9]\d?|1\d\d|2[01]\d|22[0-3])(?:\.(?:1?\d{1,2}|2[0-4]\d|25[0-5])){2}(?:\.(?:[1-9]\d?|1\d\d|2[0-4]\d|25[0-4]))|(?:(?:[a-z0-9\u00a1-\uffff][a-z0-9\u00a1-\uffff_-]{0,62})?[a-z0-9\u00a1-\uffff]\.)+(?:[a-z\u00a1-\uffff]{2,}\.?))(?::\d{2,5})?(?:[/?#]\S*)?$/i;
    const matched = uri.match(uriPattern);

    return matched !== null;
};

/**
Expand Down Expand Up @@ -282,49 +296,47 @@ Utils.reflectKeys = function (obj) {
/**
 * Wraps every URL found in a text into an `<a>` element.
 *
 * URLs starting with http://, https://, ftp:// or "www." are recognised. A "www." URL
 * gets an "http://" prefix in its href while the visible text stays as typed.
 * Percent-encoded sequences are decoded so that links such as
 * http://ru.wikipedia.org/wiki/%D0%A1%D0%B5%D0%BA%D1%81 are shown in readable form.
 * A candidate that fails Utils.validateURI, or that cannot be decoded, is left untouched.
 *
 * @param {string} text Text to search for URLs.
 * @param {string} [target] Value for the `target` attribute of generated links.
 * @param {string} [className] Value for the `class` attribute of generated links.
 * @returns {string} Text with the recognised URLs replaced by anchors.
 */
Utils.linkifyUrlString = function (text, target, className) {
    'use strict';

    const targetAttr = target ? ` target="${target}"` : '';
    const classAttr = className ? ` class="${className}"` : '';

    // decodeURI() turns %22, %27, %3C and %3E back into quotes and angle brackets.
    // They must be re-escaped before being placed into markup, otherwise a crafted
    // URL could break out of the href attribute or inject tags. Ampersand is left
    // alone so that query strings and already escaped entities are not double-escaped.
    const htmlEntities = { '<': '&lt;', '>': '&gt;', '"': '&quot;', "'": '&#39;' };
    const escapeDecoded = str => str.replace(/[<>"']/g, ch => htmlEntities[ch]);

    const replaceLink = function (match, url, punctuation) {
        const append = punctuation || '';
        const href = /^www\./i.test(url) ? url.replace(/^www\./i, 'http://www.') : url;

        if (!Utils.validateURI(href)) {
            // Invalid URL, return original string.
            return match;
        }

        try {
            const safeHref = escapeDecoded(decodeURI(href));
            const safeText = escapeDecoded(decodeURI(url));

            return `<a href="${safeHref}" rel="nofollow noopener"${targetAttr}${classAttr}>${safeText}</a>${append}`;
        } catch (err) {
            // Malformed URI sequence (decodeURI throws), return original string.
            return match;
        }
    };

    // Capture url starting with http://, https://, ftp:// or www., keep one trailing
    // punctuation character (one of . , ; ! ? or a closing parenthesis) in a separate
    // group, so it is appended after the link instead of becoming part of it.
    const simpleURLRegex = /\b((?:(?:https?|ftp):\/\/|www\.)[^'">\s]+\.[^'">\s]+?)([.,;!?)]?)(?=\s|$)/gmi;

    return text.replace(simpleURLRegex, replaceLink);
};

Utils.inputIncomingParse = (function () {
'use strict';

const host = config.client.host;
const reversedEscapeChars = { '<': 'lt', '>': 'gt', '"': 'quot', '&': 'amp', "'": '#39' };
const trailingChars = '\s).,;>!?'; // eslint-disable-line no-useless-escape

function escape(txt) {
//Паттерн из _s.escapeHTML(result); исключая амперсант
Expand All @@ -338,15 +350,17 @@ Utils.inputIncomingParse = (function () {

//Заменяем ссылку на фото на диез-ссылку #xxx
//Например, http://domain.com/p/123456 -> #123456
result = result.replace(new RegExp(`(\\b)(?:https?://)?(?:www.)?${host}/p/(\\d{1,8})/?(?=[\\s\\)\\.,;>]|$)`, 'gi'), '$1#$2');
result = result.replace(new RegExp(`(\\b)(?:https?://)?(?:www.)?${host}/p/(\\d{1,8})/?(?=[${trailingChars}]|$)`, 'gi'), '$1#$2');
// /p/123456 -> #123456
result = result.replace(new RegExp(`(^|\\s|\\()/p/(\\d{1,8})/?(?=[${trailingChars}]|$)`, 'gi'), '$1#$2');

//Все внутрипортальные ссылки оставляем без доменного имени, от корня
//Например, http://domain.com/u/klimashkin/photo -> /u/klimashkin/photo
result = result.replace(new RegExp(`(\\b)(?:https?://)?(?:www.)?${host}(/[-A-Z0-9+&@#\\/%?=~_|!:,.;]*[-A-Z0-9+&@#\\/%=~_|])`, 'gim'), '$1$2');

// Replace links to protected/covered photos with regular link
// For example, /_pr/a/b/c/abc.jpg -> /_p/a/b/c/abc.jpg
result = result.replace(/\/_prn?\/([/a-z0-9]{26,40}\.(?:jpe?g|png))/gi, '/_p/$1');
result = result.replace(/\/_prn?\/([/a-z0-9]+\.(?:jpe?g|png))/gi, '/_p/$1');

const plain = result;

Expand All @@ -359,7 +373,7 @@ Utils.inputIncomingParse = (function () {

//Заменяем диез-ссылку фото #xxx на линк
//Например, #123456 -> <a target="_blank" class="sharpPhoto" href="/p/123456">#123456</a>
result = result.replace(/(^|\s|\()#(\d{1,8})(?=[\s).,]|$)/g, '$1<a target="_blank" class="sharpPhoto" href="/p/$2">#$2</a>');
result = result.replace(new RegExp(`(^|\\s|\\()#(\\d{1,8})(?=[${trailingChars}]|$)`, 'g'), '$1<a target="_blank" class="sharpPhoto" href="/p/$2">#$2</a>');

result = Utils.linkifyUrlString(result, '_blank'); //Оборачиваем остальные url в ahref
result = result.replace(/\n{3,}/g, '<br><br>').replace(/\n/g, '<br>'); //Заменяем переносы на <br>
Expand Down
95 changes: 95 additions & 0 deletions commons/__tests__/Utils.test.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
/**
* Copyright: The PastVu contributors.
* GNU Affero General Public License v3.0+ (see COPYING or https://www.gnu.org/licenses/agpl.txt)
*/

import Utils from '../Utils';
import config from '../../config';

const origin = config.client.origin;

/**
 * Shared test body: parses `testString` with Utils.inputIncomingParse and
 * checks that the `result` field strictly equals `expectedString`.
 *
 * The leading `desc` argument is not used here; it only keeps the signature
 * aligned with rows of the form [title, input, expected].
 */
const testInputIncomingParse = (desc, testString, expectedString) => {
    expect.assertions(1);

    const { result } = Utils.inputIncomingParse(testString);

    expect(result).toStrictEqual(expectedString);
};

describe('utils', () => {
describe('incoming input parsing', () => {
// Whitespace and line-break cases for Utils.inputIncomingParse.
describe('should strip spaces and replace next line', () => {
// Each row is [test title, raw input, expected result]; it.each passes the row
// items to testInputIncomingParse as (desc, testString, expectedString).
const testData = [
['string with spaces', ' String with spaces in the middle and at both ends ', 'String with spaces in the middle and at both ends'],
['multiline string 1', `line
another line`, 'line<br> another line'],
['multiline string 2', `line
another line`, 'line<br><br> another line'],
['multiline string 3', `line
another line`, 'line<br><br> another line'],
];

it.each(testData)('%s', testInputIncomingParse); // eslint-disable-line jest/expect-expect
});

// Links that point at the site itself (`origin` comes from config.client.origin):
// photo pages, #hash references, user pages and protected photo files.
describe('should replace internal links', () => {
// Each row is [test title, raw input, expected result]; it.each passes the row
// items to testInputIncomingParse as (desc, testString, expectedString).
const testData = [
['replace photo url', `${origin}/p/123456`, '<a target="_blank" class="sharpPhoto" href="/p/123456">#123456</a>'],
['replace photo path', '/p/123456', '<a target="_blank" class="sharpPhoto" href="/p/123456">#123456</a>'],
['replace photo hash', '#123456', '<a target="_blank" class="sharpPhoto" href="/p/123456">#123456</a>'],
['replace encoded url', 'https://ru.wikipedia.org/wiki/%D0%A4%D0%BE%D1%82%D0%BE%D0%B3%D1%80%D0%B0%D1%84%D0%B8%D1%8F', '<a href="https://ru.wikipedia.org/wiki/Фотография" rel="nofollow noopener" target="_blank">https://ru.wikipedia.org/wiki/Фотография</a>'],
['shorten internal url', `${origin}/u/klimashkin/photo`, '<a target="_blank" class="innerLink" href="/u/klimashkin/photo">/u/klimashkin/photo</a>'],
['replace internal path', '/u/klimashkin/photo', '<a target="_blank" class="innerLink" href="/u/klimashkin/photo">/u/klimashkin/photo</a>'],
['replace protected photo url', `${origin}/_pr/a/b/c/abc.jpg`, '<a target="_blank" class="innerLink" href="/_p/a/b/c/abc.jpg">/_p/a/b/c/abc.jpg</a>'],
['replace protected photo url 1', `${origin}/_prn/a/b/c/abc.png`, '<a target="_blank" class="innerLink" href="/_p/a/b/c/abc.png">/_p/a/b/c/abc.png</a>'],
];

it.each(testData)('%s', testInputIncomingParse); // eslint-disable-line jest/expect-expect
});

// Internal links wrapped in parentheses or followed by `.`, `,` or `;`:
// the expected output keeps that punctuation outside of the generated anchor.
describe('should respect heading and trailing punctuation for internal links', () => {
// Each row is [test title, raw input, expected result]; it.each passes the row
// items to testInputIncomingParse as (desc, testString, expectedString).
const testData = [
['photo url', `(${origin}/p/123456) #123456.`, '(<a target="_blank" class="sharpPhoto" href="/p/123456">#123456</a>) <a target="_blank" class="sharpPhoto" href="/p/123456">#123456</a>.'],
['internal url', `${origin}/u/klimashkin/photo, ${origin}/u/klimashkin/photo; (/u/klimashkin/photo)`, '<a target="_blank" class="innerLink" href="/u/klimashkin/photo">/u/klimashkin/photo</a>, <a target="_blank" class="innerLink" href="/u/klimashkin/photo">/u/klimashkin/photo</a>; (<a target="_blank" class="innerLink" href="/u/klimashkin/photo">/u/klimashkin/photo</a>)'],
];

it.each(testData)('%s', testInputIncomingParse); // eslint-disable-line jest/expect-expect
});

// External URLs: with a fragment, with a bare "www." prefix (href gains "http://",
// link text stays as typed) and with a query string.
describe('should replace external links', () => {
// Each row is [test title, raw input, expected result]; it.each passes the row
// items to testInputIncomingParse as (desc, testString, expectedString).
const testData = [
['replace url', 'https://jestjs.io/docs/expect#expectassertionsnumber', '<a href="https://jestjs.io/docs/expect#expectassertionsnumber" rel="nofollow noopener" target="_blank">https://jestjs.io/docs/expect#expectassertionsnumber</a>'],
['replace www url', 'www.moodle.org', '<a href="http://www.moodle.org" rel="nofollow noopener" target="_blank">www.moodle.org</a>'],
['replace url with params', 'https://jestjs.io/docs/expect?show=all', '<a href="https://jestjs.io/docs/expect?show=all" rel="nofollow noopener" target="_blank">https://jestjs.io/docs/expect?show=all</a>'],
];

it.each(testData)('%s', testInputIncomingParse); // eslint-disable-line jest/expect-expect
});

// External URLs followed by `.`, `,` or `;`: the expected output places the
// punctuation after the closing </a>, and identical URLs are each linked.
describe('should replace external links with punctuation', () => {
// Each row is [test title, raw input, expected result]; it.each passes the row
// items to testInputIncomingParse as (desc, testString, expectedString).
const testData = [
['replace url', 'Please check https://jestjs.io/docs/expect. This is important.', 'Please check <a href="https://jestjs.io/docs/expect" rel="nofollow noopener" target="_blank">https://jestjs.io/docs/expect</a>. This is important.'],
['replace urls multiline', `Check www.github.com,
and also http://docs.pastvu.com;`, 'Check <a href="http://www.github.com" rel="nofollow noopener" target="_blank">www.github.com</a>,<br> and also <a href="http://docs.pastvu.com" rel="nofollow noopener" target="_blank">http://docs.pastvu.com</a>;'],
['replace identical urls', 'Please check https://jestjs.io/docs/expect, https://jestjs.io/docs/expect.', 'Please check <a href="https://jestjs.io/docs/expect" rel="nofollow noopener" target="_blank">https://jestjs.io/docs/expect</a>, <a href="https://jestjs.io/docs/expect" rel="nofollow noopener" target="_blank">https://jestjs.io/docs/expect</a>.'],
];

it.each(testData)('%s', testInputIncomingParse); // eslint-disable-line jest/expect-expect
});

// End-to-end case mixing a user path, photo hashes, external URLs and line breaks
// in a single multi-line input.
it('should replace links in complex example', () => {
expect.assertions(1);

const testString = `Hello /u/testuser, this photo #123456 (also #123457, #456789)
are related and taken from the http://oldtown.com.
Please amend the sources. You can find more information on https://docs.pastvu.com; https://docs.pastvu.com?id=3.`;

// NOTE(review): the first `#123456` (followed by a space) is expected to stay plain
// text, while `#123457,` and `#456789)` are expected to become links. This looks like
// it pins a quirk of `trailingChars = '\s).,;>!?'` in Utils.js: inside a plain string
// literal `\s` is just the letter "s", so whitespace would not be in the character
// class built from it. Confirm whether a hash followed by a space should be linked.
const expectedString = 'Hello <a target="_blank" class="innerLink" href="/u/testuser">/u/testuser</a>, this photo #123456 (also <a target="_blank" class="sharpPhoto" href="/p/123457">#123457</a>, <a target="_blank" class="sharpPhoto" href="/p/456789">#456789</a>)<br> are related and taken from the <a href="http://oldtown.com" rel="nofollow noopener" target="_blank">http://oldtown.com</a>.<br> Please amend the sources. You can find more information on <a href="https://docs.pastvu.com" rel="nofollow noopener" target="_blank">https://docs.pastvu.com</a>; <a href="https://docs.pastvu.com?id=3" rel="nofollow noopener" target="_blank">https://docs.pastvu.com?id=3</a>.';

testInputIncomingParse('', testString, expectedString);
});
});
});
105 changes: 0 additions & 105 deletions commons/regex-weburl.js

This file was deleted.

0 comments on commit 95d046f

Please sign in to comment.