diff --git a/.nojekyll b/.nojekyll new file mode 100644 index 00000000..e69de29b diff --git a/404.html b/404.html new file mode 100644 index 00000000..1fe2faa7 --- /dev/null +++ b/404.html @@ -0,0 +1,576 @@ + + + + + + + + + + + + + + + + + + Danish Foundation Models + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+
+ +
+ + + + +
+ + +
+ +
+ + + + + + + + + +
+
+ + + +
+
+
+ + + + + + + + +
+
+
+ + + + +
+
+ +

404 - Not found

+ +
+
+ + + + +
+ + + +
+ + + +
+
+
+
+ + + + + + + + + \ No newline at end of file diff --git a/_static/collab.png b/_static/collab.png new file mode 100644 index 00000000..cadfe05c Binary files /dev/null and b/_static/collab.png differ diff --git a/_static/dev_container.png b/_static/dev_container.png new file mode 100644 index 00000000..eeccffd5 Binary files /dev/null and b/_static/dev_container.png differ diff --git a/_static/icon.png b/_static/icon.png new file mode 100644 index 00000000..399ddd51 Binary files /dev/null and b/_static/icon.png differ diff --git a/_static/logo.png b/_static/logo.png new file mode 100644 index 00000000..a0b81d43 Binary files /dev/null and b/_static/logo.png differ diff --git a/_static/structure.png b/_static/structure.png new file mode 100644 index 00000000..f8acfc9b Binary files /dev/null and b/_static/structure.png differ diff --git a/assets/images/favicon.png b/assets/images/favicon.png new file mode 100644 index 00000000..1cf13b9f Binary files /dev/null and b/assets/images/favicon.png differ diff --git a/assets/javascripts/bundle.220ee61c.min.js b/assets/javascripts/bundle.220ee61c.min.js new file mode 100644 index 00000000..116072a1 --- /dev/null +++ b/assets/javascripts/bundle.220ee61c.min.js @@ -0,0 +1,29 @@ +"use strict";(()=>{var Ci=Object.create;var gr=Object.defineProperty;var Ri=Object.getOwnPropertyDescriptor;var ki=Object.getOwnPropertyNames,Ht=Object.getOwnPropertySymbols,Hi=Object.getPrototypeOf,yr=Object.prototype.hasOwnProperty,nn=Object.prototype.propertyIsEnumerable;var rn=(e,t,r)=>t in e?gr(e,t,{enumerable:!0,configurable:!0,writable:!0,value:r}):e[t]=r,P=(e,t)=>{for(var r in t||(t={}))yr.call(t,r)&&rn(e,r,t[r]);if(Ht)for(var r of Ht(t))nn.call(t,r)&&rn(e,r,t[r]);return e};var on=(e,t)=>{var r={};for(var n in e)yr.call(e,n)&&t.indexOf(n)<0&&(r[n]=e[n]);if(e!=null&&Ht)for(var n of Ht(e))t.indexOf(n)<0&&nn.call(e,n)&&(r[n]=e[n]);return r};var Pt=(e,t)=>()=>(t||e((t={exports:{}}).exports,t),t.exports);var Pi=(e,t,r,n)=>{if(t&&typeof 
t=="object"||typeof t=="function")for(let o of ki(t))!yr.call(e,o)&&o!==r&&gr(e,o,{get:()=>t[o],enumerable:!(n=Ri(t,o))||n.enumerable});return e};var yt=(e,t,r)=>(r=e!=null?Ci(Hi(e)):{},Pi(t||!e||!e.__esModule?gr(r,"default",{value:e,enumerable:!0}):r,e));var sn=Pt((xr,an)=>{(function(e,t){typeof xr=="object"&&typeof an!="undefined"?t():typeof define=="function"&&define.amd?define(t):t()})(xr,function(){"use strict";function e(r){var n=!0,o=!1,i=null,s={text:!0,search:!0,url:!0,tel:!0,email:!0,password:!0,number:!0,date:!0,month:!0,week:!0,time:!0,datetime:!0,"datetime-local":!0};function a(O){return!!(O&&O!==document&&O.nodeName!=="HTML"&&O.nodeName!=="BODY"&&"classList"in O&&"contains"in O.classList)}function f(O){var Qe=O.type,De=O.tagName;return!!(De==="INPUT"&&s[Qe]&&!O.readOnly||De==="TEXTAREA"&&!O.readOnly||O.isContentEditable)}function c(O){O.classList.contains("focus-visible")||(O.classList.add("focus-visible"),O.setAttribute("data-focus-visible-added",""))}function u(O){O.hasAttribute("data-focus-visible-added")&&(O.classList.remove("focus-visible"),O.removeAttribute("data-focus-visible-added"))}function p(O){O.metaKey||O.altKey||O.ctrlKey||(a(r.activeElement)&&c(r.activeElement),n=!0)}function m(O){n=!1}function d(O){a(O.target)&&(n||f(O.target))&&c(O.target)}function h(O){a(O.target)&&(O.target.classList.contains("focus-visible")||O.target.hasAttribute("data-focus-visible-added"))&&(o=!0,window.clearTimeout(i),i=window.setTimeout(function(){o=!1},100),u(O.target))}function v(O){document.visibilityState==="hidden"&&(o&&(n=!0),Y())}function Y(){document.addEventListener("mousemove",N),document.addEventListener("mousedown",N),document.addEventListener("mouseup",N),document.addEventListener("pointermove",N),document.addEventListener("pointerdown",N),document.addEventListener("pointerup",N),document.addEventListener("touchmove",N),document.addEventListener("touchstart",N),document.addEventListener("touchend",N)}function 
B(){document.removeEventListener("mousemove",N),document.removeEventListener("mousedown",N),document.removeEventListener("mouseup",N),document.removeEventListener("pointermove",N),document.removeEventListener("pointerdown",N),document.removeEventListener("pointerup",N),document.removeEventListener("touchmove",N),document.removeEventListener("touchstart",N),document.removeEventListener("touchend",N)}function N(O){O.target.nodeName&&O.target.nodeName.toLowerCase()==="html"||(n=!1,B())}document.addEventListener("keydown",p,!0),document.addEventListener("mousedown",m,!0),document.addEventListener("pointerdown",m,!0),document.addEventListener("touchstart",m,!0),document.addEventListener("visibilitychange",v,!0),Y(),r.addEventListener("focus",d,!0),r.addEventListener("blur",h,!0),r.nodeType===Node.DOCUMENT_FRAGMENT_NODE&&r.host?r.host.setAttribute("data-js-focus-visible",""):r.nodeType===Node.DOCUMENT_NODE&&(document.documentElement.classList.add("js-focus-visible"),document.documentElement.setAttribute("data-js-focus-visible",""))}if(typeof window!="undefined"&&typeof document!="undefined"){window.applyFocusVisiblePolyfill=e;var t;try{t=new CustomEvent("focus-visible-polyfill-ready")}catch(r){t=document.createEvent("CustomEvent"),t.initCustomEvent("focus-visible-polyfill-ready",!1,!1,{})}window.dispatchEvent(t)}typeof document!="undefined"&&e(document)})});var cn=Pt(Er=>{(function(e){var t=function(){try{return!!Symbol.iterator}catch(c){return!1}},r=t(),n=function(c){var u={next:function(){var p=c.shift();return{done:p===void 0,value:p}}};return r&&(u[Symbol.iterator]=function(){return u}),u},o=function(c){return encodeURIComponent(c).replace(/%20/g,"+")},i=function(c){return decodeURIComponent(String(c).replace(/\+/g," "))},s=function(){var c=function(p){Object.defineProperty(this,"_entries",{writable:!0,value:{}});var m=typeof p;if(m!=="undefined")if(m==="string")p!==""&&this._fromString(p);else if(p instanceof c){var 
d=this;p.forEach(function(B,N){d.append(N,B)})}else if(p!==null&&m==="object")if(Object.prototype.toString.call(p)==="[object Array]")for(var h=0;hd[0]?1:0}),c._entries&&(c._entries={});for(var p=0;p1?i(d[1]):"")}})})(typeof global!="undefined"?global:typeof window!="undefined"?window:typeof self!="undefined"?self:Er);(function(e){var t=function(){try{var o=new e.URL("b","http://a");return o.pathname="c d",o.href==="http://a/c%20d"&&o.searchParams}catch(i){return!1}},r=function(){var o=e.URL,i=function(f,c){typeof f!="string"&&(f=String(f)),c&&typeof c!="string"&&(c=String(c));var u=document,p;if(c&&(e.location===void 0||c!==e.location.href)){c=c.toLowerCase(),u=document.implementation.createHTMLDocument(""),p=u.createElement("base"),p.href=c,u.head.appendChild(p);try{if(p.href.indexOf(c)!==0)throw new Error(p.href)}catch(O){throw new Error("URL unable to set base "+c+" due to "+O)}}var m=u.createElement("a");m.href=f,p&&(u.body.appendChild(m),m.href=m.href);var d=u.createElement("input");if(d.type="url",d.value=f,m.protocol===":"||!/:/.test(m.href)||!d.checkValidity()&&!c)throw new TypeError("Invalid URL");Object.defineProperty(this,"_anchorElement",{value:m});var h=new e.URLSearchParams(this.search),v=!0,Y=!0,B=this;["append","delete","set"].forEach(function(O){var Qe=h[O];h[O]=function(){Qe.apply(h,arguments),v&&(Y=!1,B.search=h.toString(),Y=!0)}}),Object.defineProperty(this,"searchParams",{value:h,enumerable:!0});var N=void 0;Object.defineProperty(this,"_updateSearchParams",{enumerable:!1,configurable:!1,writable:!1,value:function(){this.search!==N&&(N=this.search,Y&&(v=!1,this.searchParams._fromString(this.search),v=!0))}})},s=i.prototype,a=function(f){Object.defineProperty(s,f,{get:function(){return this._anchorElement[f]},set:function(c){this._anchorElement[f]=c},enumerable:!0})};["hash","host","hostname","port","protocol"].forEach(function(f){a(f)}),Object.defineProperty(s,"search",{get:function(){return 
this._anchorElement.search},set:function(f){this._anchorElement.search=f,this._updateSearchParams()},enumerable:!0}),Object.defineProperties(s,{toString:{get:function(){var f=this;return function(){return f.href}}},href:{get:function(){return this._anchorElement.href.replace(/\?$/,"")},set:function(f){this._anchorElement.href=f,this._updateSearchParams()},enumerable:!0},pathname:{get:function(){return this._anchorElement.pathname.replace(/(^\/?)/,"/")},set:function(f){this._anchorElement.pathname=f},enumerable:!0},origin:{get:function(){var f={"http:":80,"https:":443,"ftp:":21}[this._anchorElement.protocol],c=this._anchorElement.port!=f&&this._anchorElement.port!=="";return this._anchorElement.protocol+"//"+this._anchorElement.hostname+(c?":"+this._anchorElement.port:"")},enumerable:!0},password:{get:function(){return""},set:function(f){},enumerable:!0},username:{get:function(){return""},set:function(f){},enumerable:!0}}),i.createObjectURL=function(f){return o.createObjectURL.apply(o,arguments)},i.revokeObjectURL=function(f){return o.revokeObjectURL.apply(o,arguments)},e.URL=i};if(t()||r(),e.location!==void 0&&!("origin"in e.location)){var n=function(){return e.location.protocol+"//"+e.location.hostname+(e.location.port?":"+e.location.port:"")};try{Object.defineProperty(e.location,"origin",{get:n,enumerable:!0})}catch(o){setInterval(function(){e.location.origin=n()},100)}}})(typeof global!="undefined"?global:typeof window!="undefined"?window:typeof self!="undefined"?self:Er)});var qr=Pt((Mt,Nr)=>{/*! 
+ * clipboard.js v2.0.11 + * https://clipboardjs.com/ + * + * Licensed MIT © Zeno Rocha + */(function(t,r){typeof Mt=="object"&&typeof Nr=="object"?Nr.exports=r():typeof define=="function"&&define.amd?define([],r):typeof Mt=="object"?Mt.ClipboardJS=r():t.ClipboardJS=r()})(Mt,function(){return function(){var e={686:function(n,o,i){"use strict";i.d(o,{default:function(){return Ai}});var s=i(279),a=i.n(s),f=i(370),c=i.n(f),u=i(817),p=i.n(u);function m(j){try{return document.execCommand(j)}catch(T){return!1}}var d=function(T){var E=p()(T);return m("cut"),E},h=d;function v(j){var T=document.documentElement.getAttribute("dir")==="rtl",E=document.createElement("textarea");E.style.fontSize="12pt",E.style.border="0",E.style.padding="0",E.style.margin="0",E.style.position="absolute",E.style[T?"right":"left"]="-9999px";var H=window.pageYOffset||document.documentElement.scrollTop;return E.style.top="".concat(H,"px"),E.setAttribute("readonly",""),E.value=j,E}var Y=function(T,E){var H=v(T);E.container.appendChild(H);var I=p()(H);return m("copy"),H.remove(),I},B=function(T){var E=arguments.length>1&&arguments[1]!==void 0?arguments[1]:{container:document.body},H="";return typeof T=="string"?H=Y(T,E):T instanceof HTMLInputElement&&!["text","search","url","tel","password"].includes(T==null?void 0:T.type)?H=Y(T.value,E):(H=p()(T),m("copy")),H},N=B;function O(j){"@babel/helpers - typeof";return typeof Symbol=="function"&&typeof Symbol.iterator=="symbol"?O=function(E){return typeof E}:O=function(E){return E&&typeof Symbol=="function"&&E.constructor===Symbol&&E!==Symbol.prototype?"symbol":typeof E},O(j)}var Qe=function(){var T=arguments.length>0&&arguments[0]!==void 0?arguments[0]:{},E=T.action,H=E===void 0?"copy":E,I=T.container,q=T.target,Me=T.text;if(H!=="copy"&&H!=="cut")throw new Error('Invalid "action" value, use either "copy" or "cut"');if(q!==void 0)if(q&&O(q)==="object"&&q.nodeType===1){if(H==="copy"&&q.hasAttribute("disabled"))throw new Error('Invalid "target" attribute. 
Please use "readonly" instead of "disabled" attribute');if(H==="cut"&&(q.hasAttribute("readonly")||q.hasAttribute("disabled")))throw new Error(`Invalid "target" attribute. You can't cut text from elements with "readonly" or "disabled" attributes`)}else throw new Error('Invalid "target" value, use a valid Element');if(Me)return N(Me,{container:I});if(q)return H==="cut"?h(q):N(q,{container:I})},De=Qe;function $e(j){"@babel/helpers - typeof";return typeof Symbol=="function"&&typeof Symbol.iterator=="symbol"?$e=function(E){return typeof E}:$e=function(E){return E&&typeof Symbol=="function"&&E.constructor===Symbol&&E!==Symbol.prototype?"symbol":typeof E},$e(j)}function Ei(j,T){if(!(j instanceof T))throw new TypeError("Cannot call a class as a function")}function tn(j,T){for(var E=0;E0&&arguments[0]!==void 0?arguments[0]:{};this.action=typeof I.action=="function"?I.action:this.defaultAction,this.target=typeof I.target=="function"?I.target:this.defaultTarget,this.text=typeof I.text=="function"?I.text:this.defaultText,this.container=$e(I.container)==="object"?I.container:document.body}},{key:"listenClick",value:function(I){var q=this;this.listener=c()(I,"click",function(Me){return q.onClick(Me)})}},{key:"onClick",value:function(I){var q=I.delegateTarget||I.currentTarget,Me=this.action(q)||"copy",kt=De({action:Me,container:this.container,target:this.target(q),text:this.text(q)});this.emit(kt?"success":"error",{action:Me,text:kt,trigger:q,clearSelection:function(){q&&q.focus(),window.getSelection().removeAllRanges()}})}},{key:"defaultAction",value:function(I){return vr("action",I)}},{key:"defaultTarget",value:function(I){var q=vr("target",I);if(q)return document.querySelector(q)}},{key:"defaultText",value:function(I){return vr("text",I)}},{key:"destroy",value:function(){this.listener.destroy()}}],[{key:"copy",value:function(I){var q=arguments.length>1&&arguments[1]!==void 0?arguments[1]:{container:document.body};return N(I,q)}},{key:"cut",value:function(I){return 
h(I)}},{key:"isSupported",value:function(){var I=arguments.length>0&&arguments[0]!==void 0?arguments[0]:["copy","cut"],q=typeof I=="string"?[I]:I,Me=!!document.queryCommandSupported;return q.forEach(function(kt){Me=Me&&!!document.queryCommandSupported(kt)}),Me}}]),E}(a()),Ai=Li},828:function(n){var o=9;if(typeof Element!="undefined"&&!Element.prototype.matches){var i=Element.prototype;i.matches=i.matchesSelector||i.mozMatchesSelector||i.msMatchesSelector||i.oMatchesSelector||i.webkitMatchesSelector}function s(a,f){for(;a&&a.nodeType!==o;){if(typeof a.matches=="function"&&a.matches(f))return a;a=a.parentNode}}n.exports=s},438:function(n,o,i){var s=i(828);function a(u,p,m,d,h){var v=c.apply(this,arguments);return u.addEventListener(m,v,h),{destroy:function(){u.removeEventListener(m,v,h)}}}function f(u,p,m,d,h){return typeof u.addEventListener=="function"?a.apply(null,arguments):typeof m=="function"?a.bind(null,document).apply(null,arguments):(typeof u=="string"&&(u=document.querySelectorAll(u)),Array.prototype.map.call(u,function(v){return a(v,p,m,d,h)}))}function c(u,p,m,d){return function(h){h.delegateTarget=s(h.target,p),h.delegateTarget&&d.call(u,h)}}n.exports=f},879:function(n,o){o.node=function(i){return i!==void 0&&i instanceof HTMLElement&&i.nodeType===1},o.nodeList=function(i){var s=Object.prototype.toString.call(i);return i!==void 0&&(s==="[object NodeList]"||s==="[object HTMLCollection]")&&"length"in i&&(i.length===0||o.node(i[0]))},o.string=function(i){return typeof i=="string"||i instanceof String},o.fn=function(i){var s=Object.prototype.toString.call(i);return s==="[object Function]"}},370:function(n,o,i){var s=i(879),a=i(438);function f(m,d,h){if(!m&&!d&&!h)throw new Error("Missing required arguments");if(!s.string(d))throw new TypeError("Second argument must be a String");if(!s.fn(h))throw new TypeError("Third argument must be a Function");if(s.node(m))return c(m,d,h);if(s.nodeList(m))return u(m,d,h);if(s.string(m))return p(m,d,h);throw new 
TypeError("First argument must be a String, HTMLElement, HTMLCollection, or NodeList")}function c(m,d,h){return m.addEventListener(d,h),{destroy:function(){m.removeEventListener(d,h)}}}function u(m,d,h){return Array.prototype.forEach.call(m,function(v){v.addEventListener(d,h)}),{destroy:function(){Array.prototype.forEach.call(m,function(v){v.removeEventListener(d,h)})}}}function p(m,d,h){return a(document.body,m,d,h)}n.exports=f},817:function(n){function o(i){var s;if(i.nodeName==="SELECT")i.focus(),s=i.value;else if(i.nodeName==="INPUT"||i.nodeName==="TEXTAREA"){var a=i.hasAttribute("readonly");a||i.setAttribute("readonly",""),i.select(),i.setSelectionRange(0,i.value.length),a||i.removeAttribute("readonly"),s=i.value}else{i.hasAttribute("contenteditable")&&i.focus();var f=window.getSelection(),c=document.createRange();c.selectNodeContents(i),f.removeAllRanges(),f.addRange(c),s=f.toString()}return s}n.exports=o},279:function(n){function o(){}o.prototype={on:function(i,s,a){var f=this.e||(this.e={});return(f[i]||(f[i]=[])).push({fn:s,ctx:a}),this},once:function(i,s,a){var f=this;function c(){f.off(i,c),s.apply(a,arguments)}return c._=s,this.on(i,c,a)},emit:function(i){var s=[].slice.call(arguments,1),a=((this.e||(this.e={}))[i]||[]).slice(),f=0,c=a.length;for(f;f{"use strict";/*! 
+ * escape-html + * Copyright(c) 2012-2013 TJ Holowaychuk + * Copyright(c) 2015 Andreas Lubbe + * Copyright(c) 2015 Tiancheng "Timothy" Gu + * MIT Licensed + */var rs=/["'&<>]/;Yo.exports=ns;function ns(e){var t=""+e,r=rs.exec(t);if(!r)return t;var n,o="",i=0,s=0;for(i=r.index;i0&&i[i.length-1])&&(c[0]===6||c[0]===2)){r=0;continue}if(c[0]===3&&(!i||c[1]>i[0]&&c[1]=e.length&&(e=void 0),{value:e&&e[n++],done:!e}}};throw new TypeError(t?"Object is not iterable.":"Symbol.iterator is not defined.")}function W(e,t){var r=typeof Symbol=="function"&&e[Symbol.iterator];if(!r)return e;var n=r.call(e),o,i=[],s;try{for(;(t===void 0||t-- >0)&&!(o=n.next()).done;)i.push(o.value)}catch(a){s={error:a}}finally{try{o&&!o.done&&(r=n.return)&&r.call(n)}finally{if(s)throw s.error}}return i}function D(e,t,r){if(r||arguments.length===2)for(var n=0,o=t.length,i;n1||a(m,d)})})}function a(m,d){try{f(n[m](d))}catch(h){p(i[0][3],h)}}function f(m){m.value instanceof et?Promise.resolve(m.value.v).then(c,u):p(i[0][2],m)}function c(m){a("next",m)}function u(m){a("throw",m)}function p(m,d){m(d),i.shift(),i.length&&a(i[0][0],i[0][1])}}function pn(e){if(!Symbol.asyncIterator)throw new TypeError("Symbol.asyncIterator is not defined.");var t=e[Symbol.asyncIterator],r;return t?t.call(e):(e=typeof Ee=="function"?Ee(e):e[Symbol.iterator](),r={},n("next"),n("throw"),n("return"),r[Symbol.asyncIterator]=function(){return this},r);function n(i){r[i]=e[i]&&function(s){return new Promise(function(a,f){s=e[i](s),o(a,f,s.done,s.value)})}}function o(i,s,a,f){Promise.resolve(f).then(function(c){i({value:c,done:a})},s)}}function C(e){return typeof e=="function"}function at(e){var t=function(n){Error.call(n),n.stack=new Error().stack},r=e(t);return r.prototype=Object.create(Error.prototype),r.prototype.constructor=r,r}var It=at(function(e){return function(r){e(this),this.message=r?r.length+` errors occurred during unsubscription: +`+r.map(function(n,o){return o+1+") "+n.toString()}).join(` + 
`):"",this.name="UnsubscriptionError",this.errors=r}});function Ve(e,t){if(e){var r=e.indexOf(t);0<=r&&e.splice(r,1)}}var Ie=function(){function e(t){this.initialTeardown=t,this.closed=!1,this._parentage=null,this._finalizers=null}return e.prototype.unsubscribe=function(){var t,r,n,o,i;if(!this.closed){this.closed=!0;var s=this._parentage;if(s)if(this._parentage=null,Array.isArray(s))try{for(var a=Ee(s),f=a.next();!f.done;f=a.next()){var c=f.value;c.remove(this)}}catch(v){t={error:v}}finally{try{f&&!f.done&&(r=a.return)&&r.call(a)}finally{if(t)throw t.error}}else s.remove(this);var u=this.initialTeardown;if(C(u))try{u()}catch(v){i=v instanceof It?v.errors:[v]}var p=this._finalizers;if(p){this._finalizers=null;try{for(var m=Ee(p),d=m.next();!d.done;d=m.next()){var h=d.value;try{ln(h)}catch(v){i=i!=null?i:[],v instanceof It?i=D(D([],W(i)),W(v.errors)):i.push(v)}}}catch(v){n={error:v}}finally{try{d&&!d.done&&(o=m.return)&&o.call(m)}finally{if(n)throw n.error}}}if(i)throw new It(i)}},e.prototype.add=function(t){var r;if(t&&t!==this)if(this.closed)ln(t);else{if(t instanceof e){if(t.closed||t._hasParent(this))return;t._addParent(this)}(this._finalizers=(r=this._finalizers)!==null&&r!==void 0?r:[]).push(t)}},e.prototype._hasParent=function(t){var r=this._parentage;return r===t||Array.isArray(r)&&r.includes(t)},e.prototype._addParent=function(t){var r=this._parentage;this._parentage=Array.isArray(r)?(r.push(t),r):r?[r,t]:t},e.prototype._removeParent=function(t){var r=this._parentage;r===t?this._parentage=null:Array.isArray(r)&&Ve(r,t)},e.prototype.remove=function(t){var r=this._finalizers;r&&Ve(r,t),t instanceof e&&t._removeParent(this)},e.EMPTY=function(){var t=new e;return t.closed=!0,t}(),e}();var Sr=Ie.EMPTY;function jt(e){return e instanceof Ie||e&&"closed"in e&&C(e.remove)&&C(e.add)&&C(e.unsubscribe)}function ln(e){C(e)?e():e.unsubscribe()}var Le={onUnhandledError:null,onStoppedNotification:null,Promise:void 
0,useDeprecatedSynchronousErrorHandling:!1,useDeprecatedNextContext:!1};var st={setTimeout:function(e,t){for(var r=[],n=2;n0},enumerable:!1,configurable:!0}),t.prototype._trySubscribe=function(r){return this._throwIfClosed(),e.prototype._trySubscribe.call(this,r)},t.prototype._subscribe=function(r){return this._throwIfClosed(),this._checkFinalizedStatuses(r),this._innerSubscribe(r)},t.prototype._innerSubscribe=function(r){var n=this,o=this,i=o.hasError,s=o.isStopped,a=o.observers;return i||s?Sr:(this.currentObservers=null,a.push(r),new Ie(function(){n.currentObservers=null,Ve(a,r)}))},t.prototype._checkFinalizedStatuses=function(r){var n=this,o=n.hasError,i=n.thrownError,s=n.isStopped;o?r.error(i):s&&r.complete()},t.prototype.asObservable=function(){var r=new F;return r.source=this,r},t.create=function(r,n){return new xn(r,n)},t}(F);var xn=function(e){ie(t,e);function t(r,n){var o=e.call(this)||this;return o.destination=r,o.source=n,o}return t.prototype.next=function(r){var n,o;(o=(n=this.destination)===null||n===void 0?void 0:n.next)===null||o===void 0||o.call(n,r)},t.prototype.error=function(r){var n,o;(o=(n=this.destination)===null||n===void 0?void 0:n.error)===null||o===void 0||o.call(n,r)},t.prototype.complete=function(){var r,n;(n=(r=this.destination)===null||r===void 0?void 0:r.complete)===null||n===void 0||n.call(r)},t.prototype._subscribe=function(r){var n,o;return(o=(n=this.source)===null||n===void 0?void 0:n.subscribe(r))!==null&&o!==void 0?o:Sr},t}(x);var Et={now:function(){return(Et.delegate||Date).now()},delegate:void 0};var wt=function(e){ie(t,e);function t(r,n,o){r===void 0&&(r=1/0),n===void 0&&(n=1/0),o===void 0&&(o=Et);var i=e.call(this)||this;return i._bufferSize=r,i._windowTime=n,i._timestampProvider=o,i._buffer=[],i._infiniteTimeWindow=!0,i._infiniteTimeWindow=n===1/0,i._bufferSize=Math.max(1,r),i._windowTime=Math.max(1,n),i}return t.prototype.next=function(r){var 
n=this,o=n.isStopped,i=n._buffer,s=n._infiniteTimeWindow,a=n._timestampProvider,f=n._windowTime;o||(i.push(r),!s&&i.push(a.now()+f)),this._trimBuffer(),e.prototype.next.call(this,r)},t.prototype._subscribe=function(r){this._throwIfClosed(),this._trimBuffer();for(var n=this._innerSubscribe(r),o=this,i=o._infiniteTimeWindow,s=o._buffer,a=s.slice(),f=0;f0?e.prototype.requestAsyncId.call(this,r,n,o):(r.actions.push(this),r._scheduled||(r._scheduled=ut.requestAnimationFrame(function(){return r.flush(void 0)})))},t.prototype.recycleAsyncId=function(r,n,o){var i;if(o===void 0&&(o=0),o!=null?o>0:this.delay>0)return e.prototype.recycleAsyncId.call(this,r,n,o);var s=r.actions;n!=null&&((i=s[s.length-1])===null||i===void 0?void 0:i.id)!==n&&(ut.cancelAnimationFrame(n),r._scheduled=void 0)},t}(Wt);var Sn=function(e){ie(t,e);function t(){return e!==null&&e.apply(this,arguments)||this}return t.prototype.flush=function(r){this._active=!0;var n=this._scheduled;this._scheduled=void 0;var o=this.actions,i;r=r||o.shift();do if(i=r.execute(r.state,r.delay))break;while((r=o[0])&&r.id===n&&o.shift());if(this._active=!1,i){for(;(r=o[0])&&r.id===n&&o.shift();)r.unsubscribe();throw i}},t}(Dt);var Oe=new Sn(wn);var M=new F(function(e){return e.complete()});function Vt(e){return e&&C(e.schedule)}function Cr(e){return e[e.length-1]}function Ye(e){return C(Cr(e))?e.pop():void 0}function Te(e){return Vt(Cr(e))?e.pop():void 0}function zt(e,t){return typeof Cr(e)=="number"?e.pop():t}var pt=function(e){return e&&typeof e.length=="number"&&typeof e!="function"};function Nt(e){return C(e==null?void 0:e.then)}function qt(e){return C(e[ft])}function Kt(e){return Symbol.asyncIterator&&C(e==null?void 0:e[Symbol.asyncIterator])}function Qt(e){return new TypeError("You provided "+(e!==null&&typeof e=="object"?"an invalid object":"'"+e+"'")+" where a stream was expected. 
You can provide an Observable, Promise, ReadableStream, Array, AsyncIterable, or Iterable.")}function zi(){return typeof Symbol!="function"||!Symbol.iterator?"@@iterator":Symbol.iterator}var Yt=zi();function Gt(e){return C(e==null?void 0:e[Yt])}function Bt(e){return un(this,arguments,function(){var r,n,o,i;return $t(this,function(s){switch(s.label){case 0:r=e.getReader(),s.label=1;case 1:s.trys.push([1,,9,10]),s.label=2;case 2:return[4,et(r.read())];case 3:return n=s.sent(),o=n.value,i=n.done,i?[4,et(void 0)]:[3,5];case 4:return[2,s.sent()];case 5:return[4,et(o)];case 6:return[4,s.sent()];case 7:return s.sent(),[3,2];case 8:return[3,10];case 9:return r.releaseLock(),[7];case 10:return[2]}})})}function Jt(e){return C(e==null?void 0:e.getReader)}function U(e){if(e instanceof F)return e;if(e!=null){if(qt(e))return Ni(e);if(pt(e))return qi(e);if(Nt(e))return Ki(e);if(Kt(e))return On(e);if(Gt(e))return Qi(e);if(Jt(e))return Yi(e)}throw Qt(e)}function Ni(e){return new F(function(t){var r=e[ft]();if(C(r.subscribe))return r.subscribe(t);throw new TypeError("Provided object does not correctly implement Symbol.observable")})}function qi(e){return new F(function(t){for(var r=0;r=2;return function(n){return n.pipe(e?A(function(o,i){return e(o,i,n)}):de,ge(1),r?He(t):Dn(function(){return new Zt}))}}function Vn(){for(var e=[],t=0;t=2,!0))}function pe(e){e===void 0&&(e={});var t=e.connector,r=t===void 0?function(){return new x}:t,n=e.resetOnError,o=n===void 0?!0:n,i=e.resetOnComplete,s=i===void 0?!0:i,a=e.resetOnRefCountZero,f=a===void 0?!0:a;return function(c){var u,p,m,d=0,h=!1,v=!1,Y=function(){p==null||p.unsubscribe(),p=void 0},B=function(){Y(),u=m=void 0,h=v=!1},N=function(){var O=u;B(),O==null||O.unsubscribe()};return y(function(O,Qe){d++,!v&&!h&&Y();var De=m=m!=null?m:r();Qe.add(function(){d--,d===0&&!v&&!h&&(p=$r(N,f))}),De.subscribe(Qe),!u&&d>0&&(u=new rt({next:function($e){return 
De.next($e)},error:function($e){v=!0,Y(),p=$r(B,o,$e),De.error($e)},complete:function(){h=!0,Y(),p=$r(B,s),De.complete()}}),U(O).subscribe(u))})(c)}}function $r(e,t){for(var r=[],n=2;ne.next(document)),e}function K(e,t=document){return Array.from(t.querySelectorAll(e))}function z(e,t=document){let r=ce(e,t);if(typeof r=="undefined")throw new ReferenceError(`Missing element: expected "${e}" to be present`);return r}function ce(e,t=document){return t.querySelector(e)||void 0}function _e(){return document.activeElement instanceof HTMLElement&&document.activeElement||void 0}function tr(e){return L(b(document.body,"focusin"),b(document.body,"focusout")).pipe(ke(1),l(()=>{let t=_e();return typeof t!="undefined"?e.contains(t):!1}),V(e===_e()),J())}function Xe(e){return{x:e.offsetLeft,y:e.offsetTop}}function Kn(e){return L(b(window,"load"),b(window,"resize")).pipe(Ce(0,Oe),l(()=>Xe(e)),V(Xe(e)))}function rr(e){return{x:e.scrollLeft,y:e.scrollTop}}function dt(e){return L(b(e,"scroll"),b(window,"resize")).pipe(Ce(0,Oe),l(()=>rr(e)),V(rr(e)))}var Yn=function(){if(typeof Map!="undefined")return Map;function e(t,r){var n=-1;return t.some(function(o,i){return o[0]===r?(n=i,!0):!1}),n}return function(){function t(){this.__entries__=[]}return Object.defineProperty(t.prototype,"size",{get:function(){return this.__entries__.length},enumerable:!0,configurable:!0}),t.prototype.get=function(r){var n=e(this.__entries__,r),o=this.__entries__[n];return o&&o[1]},t.prototype.set=function(r,n){var o=e(this.__entries__,r);~o?this.__entries__[o][1]=n:this.__entries__.push([r,n])},t.prototype.delete=function(r){var n=this.__entries__,o=e(n,r);~o&&n.splice(o,1)},t.prototype.has=function(r){return!!~e(this.__entries__,r)},t.prototype.clear=function(){this.__entries__.splice(0)},t.prototype.forEach=function(r,n){n===void 0&&(n=null);for(var 
o=0,i=this.__entries__;o0},e.prototype.connect_=function(){!Wr||this.connected_||(document.addEventListener("transitionend",this.onTransitionEnd_),window.addEventListener("resize",this.refresh),va?(this.mutationsObserver_=new MutationObserver(this.refresh),this.mutationsObserver_.observe(document,{attributes:!0,childList:!0,characterData:!0,subtree:!0})):(document.addEventListener("DOMSubtreeModified",this.refresh),this.mutationEventsAdded_=!0),this.connected_=!0)},e.prototype.disconnect_=function(){!Wr||!this.connected_||(document.removeEventListener("transitionend",this.onTransitionEnd_),window.removeEventListener("resize",this.refresh),this.mutationsObserver_&&this.mutationsObserver_.disconnect(),this.mutationEventsAdded_&&document.removeEventListener("DOMSubtreeModified",this.refresh),this.mutationsObserver_=null,this.mutationEventsAdded_=!1,this.connected_=!1)},e.prototype.onTransitionEnd_=function(t){var r=t.propertyName,n=r===void 0?"":r,o=ba.some(function(i){return!!~n.indexOf(i)});o&&this.refresh()},e.getInstance=function(){return this.instance_||(this.instance_=new e),this.instance_},e.instance_=null,e}(),Gn=function(e,t){for(var r=0,n=Object.keys(t);r0},e}(),Jn=typeof WeakMap!="undefined"?new WeakMap:new Yn,Xn=function(){function e(t){if(!(this instanceof e))throw new TypeError("Cannot call a class as a function.");if(!arguments.length)throw new TypeError("1 argument required, but only 0 present.");var r=ga.getInstance(),n=new La(t,r,this);Jn.set(this,n)}return e}();["observe","unobserve","disconnect"].forEach(function(e){Xn.prototype[e]=function(){var t;return(t=Jn.get(this))[e].apply(t,arguments)}});var Aa=function(){return typeof nr.ResizeObserver!="undefined"?nr.ResizeObserver:Xn}(),Zn=Aa;var eo=new x,Ca=$(()=>k(new Zn(e=>{for(let t of e)eo.next(t)}))).pipe(g(e=>L(ze,k(e)).pipe(R(()=>e.disconnect()))),X(1));function he(e){return{width:e.offsetWidth,height:e.offsetHeight}}function ye(e){return 
Ca.pipe(S(t=>t.observe(e)),g(t=>eo.pipe(A(({target:r})=>r===e),R(()=>t.unobserve(e)),l(()=>he(e)))),V(he(e)))}function bt(e){return{width:e.scrollWidth,height:e.scrollHeight}}function ar(e){let t=e.parentElement;for(;t&&(e.scrollWidth<=t.scrollWidth&&e.scrollHeight<=t.scrollHeight);)t=(e=t).parentElement;return t?e:void 0}var to=new x,Ra=$(()=>k(new IntersectionObserver(e=>{for(let t of e)to.next(t)},{threshold:0}))).pipe(g(e=>L(ze,k(e)).pipe(R(()=>e.disconnect()))),X(1));function sr(e){return Ra.pipe(S(t=>t.observe(e)),g(t=>to.pipe(A(({target:r})=>r===e),R(()=>t.unobserve(e)),l(({isIntersecting:r})=>r))))}function ro(e,t=16){return dt(e).pipe(l(({y:r})=>{let n=he(e),o=bt(e);return r>=o.height-n.height-t}),J())}var cr={drawer:z("[data-md-toggle=drawer]"),search:z("[data-md-toggle=search]")};function no(e){return cr[e].checked}function Ke(e,t){cr[e].checked!==t&&cr[e].click()}function Ue(e){let t=cr[e];return b(t,"change").pipe(l(()=>t.checked),V(t.checked))}function ka(e,t){switch(e.constructor){case HTMLInputElement:return e.type==="radio"?/^Arrow/.test(t):!0;case HTMLSelectElement:case HTMLTextAreaElement:return!0;default:return e.isContentEditable}}function Ha(){return L(b(window,"compositionstart").pipe(l(()=>!0)),b(window,"compositionend").pipe(l(()=>!1))).pipe(V(!1))}function oo(){let e=b(window,"keydown").pipe(A(t=>!(t.metaKey||t.ctrlKey)),l(t=>({mode:no("search")?"search":"global",type:t.key,claim(){t.preventDefault(),t.stopPropagation()}})),A(({mode:t,type:r})=>{if(t==="global"){let n=_e();if(typeof n!="undefined")return!ka(n,r)}return!0}),pe());return Ha().pipe(g(t=>t?M:e))}function le(){return new URL(location.href)}function ot(e){location.href=e.href}function io(){return new x}function ao(e,t){if(typeof t=="string"||typeof t=="number")e.innerHTML+=t.toString();else if(t instanceof Node)e.appendChild(t);else if(Array.isArray(t))for(let r of t)ao(e,r)}function _(e,t,...r){let n=document.createElement(e);if(t)for(let o of Object.keys(t))typeof 
t[o]!="undefined"&&(typeof t[o]!="boolean"?n.setAttribute(o,t[o]):n.setAttribute(o,""));for(let o of r)ao(n,o);return n}function fr(e){if(e>999){let t=+((e-950)%1e3>99);return`${((e+1e-6)/1e3).toFixed(t)}k`}else return e.toString()}function so(){return location.hash.substring(1)}function Dr(e){let t=_("a",{href:e});t.addEventListener("click",r=>r.stopPropagation()),t.click()}function Pa(e){return L(b(window,"hashchange"),e).pipe(l(so),V(so()),A(t=>t.length>0),X(1))}function co(e){return Pa(e).pipe(l(t=>ce(`[id="${t}"]`)),A(t=>typeof t!="undefined"))}function Vr(e){let t=matchMedia(e);return er(r=>t.addListener(()=>r(t.matches))).pipe(V(t.matches))}function fo(){let e=matchMedia("print");return L(b(window,"beforeprint").pipe(l(()=>!0)),b(window,"afterprint").pipe(l(()=>!1))).pipe(V(e.matches))}function zr(e,t){return e.pipe(g(r=>r?t():M))}function ur(e,t={credentials:"same-origin"}){return ue(fetch(`${e}`,t)).pipe(fe(()=>M),g(r=>r.status!==200?Ot(()=>new Error(r.statusText)):k(r)))}function We(e,t){return ur(e,t).pipe(g(r=>r.json()),X(1))}function uo(e,t){let r=new DOMParser;return ur(e,t).pipe(g(n=>n.text()),l(n=>r.parseFromString(n,"text/xml")),X(1))}function pr(e){let t=_("script",{src:e});return $(()=>(document.head.appendChild(t),L(b(t,"load"),b(t,"error").pipe(g(()=>Ot(()=>new ReferenceError(`Invalid script: ${e}`))))).pipe(l(()=>{}),R(()=>document.head.removeChild(t)),ge(1))))}function po(){return{x:Math.max(0,scrollX),y:Math.max(0,scrollY)}}function lo(){return L(b(window,"scroll",{passive:!0}),b(window,"resize",{passive:!0})).pipe(l(po),V(po()))}function mo(){return{width:innerWidth,height:innerHeight}}function ho(){return b(window,"resize",{passive:!0}).pipe(l(mo),V(mo()))}function bo(){return G([lo(),ho()]).pipe(l(([e,t])=>({offset:e,size:t})),X(1))}function lr(e,{viewport$:t,header$:r}){let n=t.pipe(ee("size")),o=G([n,r]).pipe(l(()=>Xe(e)));return 
G([r,t,o]).pipe(l(([{height:i},{offset:s,size:a},{x:f,y:c}])=>({offset:{x:s.x-f,y:s.y-c+i},size:a})))}(()=>{function e(n,o){parent.postMessage(n,o||"*")}function t(...n){return n.reduce((o,i)=>o.then(()=>new Promise(s=>{let a=document.createElement("script");a.src=i,a.onload=s,document.body.appendChild(a)})),Promise.resolve())}var r=class extends EventTarget{constructor(n){super(),this.url=n,this.m=i=>{i.source===this.w&&(this.dispatchEvent(new MessageEvent("message",{data:i.data})),this.onmessage&&this.onmessage(i))},this.e=(i,s,a,f,c)=>{if(s===`${this.url}`){let u=new ErrorEvent("error",{message:i,filename:s,lineno:a,colno:f,error:c});this.dispatchEvent(u),this.onerror&&this.onerror(u)}};let o=document.createElement("iframe");o.hidden=!0,document.body.appendChild(this.iframe=o),this.w.document.open(),this.w.document.write(` + + + + + + + + + + + + + + + + + + + + + + +
+ +
+ + + + +
+ + +
+ +
+ + + + + + + + + +
+
+ + + +
+
+
+ + + + + + + + +
+
+
+ + + + +
+
+ + + + + + + +

DaNews

+

Version: 1.0.0

+

Homepage: https://github.com/centre-for-humanities-computing/danish-foundation-models

+

License: Not publicly available.

+
+

DaNews consists of articles from Danish news and tabloid media from 1 December 2019 to +30 April 2021. The articles stem from multiple news sources, including both online and physical newspapers.

+

Datasheet

+

Following the recommendation and framework of [5] we add the following datasheet.

+

Motivation

+

For what purpose was the dataset created? Who created the dataset? Who funded the +creation of the dataset?

+

DANews was collected as a part of the HOPE project, examining news coverage during the COVID-19 pandemic. The purpose was to train a model to understand how the novelty and resonance imprint of COVID-19 as a case of crisis compared to non-crises news imprints.

+

Any other comments?

+

No.

+

Composition

+

What do the instances that comprise the dataset represent (e.g., documents, photos, +people, countries)?

+

Instances of the dataset are Danish articles derived from Danish tabloids or news media.

+

Does the dataset contain all possible instances or is it a sample (not necessarily +random) of instances from a larger set?

+

Prior to filtering DaNews dataset contains all digitized news articles from the given +period across the sources.

+

What data does each instance consist of? “Raw” data (e.g., unprocessed text or images) +or features? In either case, please provide a description.

+

Each instance consists of the following columns +

'ArticleUrl', 'Heading', 'SubHeading', 'Lead', 'Paragraph', 'PublishDate', 'BodyText', 
+'Captions', 'Authors', 'Source', 'WordCount', 'ArticleId', 'PageIds', 'Section', 'text'
+

+

Where we constructed the text column by joining the Heading and SubHeading +using a newline. If the text field is empty it is ignored and no newline is added. We then +join the resulting string with the BodyText using two newlines.

+

During the quality filtering, we add the following indicator columns: +

'passed_quality_filter', 'filtered_by_max_chr_length', 'filtered_by_doc_length', 
+'filtered_by_mean_word_length', 'filtered_by_alpha_ratio', 'filtered_by_stop_word', 
+'filtered_by_symbol_2_word_hashtag', 'filtered_by_symbol_2_word_ellipsis',
+'filtered_by_line_bullets_or_ellipsis', 'filtered_by_duplicate_lines_chr_fraction',
+'filtered_by_duplicate_paragraph_chr_fraction', 'filtered_by_top_ngram_chr_fraction',
+'filtered_by_duplicate_ngram_chr_fraction', 'is_duplicate'
+

+

Is there a label or target associated with each instance? If so, please provide a +description.

+

No.

+

Is any information missing from individual instances? If so, please provide a +description, explaining why this information is missing (e.g., because it was +unavailable). This does not include intentionally removed information but might +include, e.g., redacted text.

+

The team of researchers at the Humanities Computing Aarhus (CHCAA) have not +removed any information from the instances.

+

Are relationships between individual instances made explicit (e.g., users’ movie +ratings, and social network links)? If so, please describe how these relationships are made +explicit.

+

The metadata columns denote the relationship between articles including the date of +publication, sections, and authors.

+

Are there recommended data splits (e.g., training, development/validation, testing)? +If so, please provide a description of these splits, explaining the rationale behind +them.

+

There are no splits performed on this dataset.

+

Are there any errors, sources of noise, or redundancies in the dataset? If so, please +provide a description.

+

News sources can publish their content both in an online and printed format which would +lead to similar instances in the dataset. We alleviate this redundancy by removing +near-duplicates (see Preprocessing/cleaning/labeling).

+

Is the dataset self-contained, or does it link to or otherwise rely on external +resources (e.g., websites, tweets, other datasets)?

+

Articles are intended to tell a self-contained story but can include external +references such as tweets or website URLs.

+

Does the dataset contain data that, if viewed directly, might be offensive, insulting, +threatening, or might otherwise cause anxiety?

+

Articles often describe content that is considered offensive, insulting, or threatening.

+

Collection Process

+

If the dataset is a sample from a larger set, what was the sampling strategy?

+

The dataset is not a sample, but is a filtered version of the full dataset, see +Preprocessing/cleaning/labeling for more on this.

+

Over what timeframe was the data collected?

+

The dataset includes articles from 1 December 2019 to +30 April 2021.

+

Were any ethical review processes conducted?

+

No.

+

Preprocessing/cleaning/labeling

+

Was any preprocessing/Cleaning/Labeling of the data done +(e.g., discretization or bucketing, tokenization, part-of-speech tagging, +SIFT feature extraction, removal of instances, processing of missing values)?

+

DaNews has been filtered using a series of heuristic filters as well as removing +repetitious texts. Following the filtering, DaNews is deduplicated to remove exact and +near-duplicates.

+

Of all documents, 9% were filtered due to low quality and 4% because they were near-duplicates.

+

For quality filtering, DaNews applies a filter akin to [2] which contains text +that:

+
    +
  • Contain at least 2 Danish stopwords. For the stopword list we use the one used in +SpaCy v.3.1.4.
  • +
  • Have a mean word length between 3 and 10.
  • +
  • Have a token length between 50 and 100,000.
  • +
  • Have less than 5,000,000 characters.
  • +
  • Have less than 60% of words containing an alphabetic character.
  • +
  • Have a symbol-to-word ratio lower than 10% for hashtags and ellipsis.
  • +
  • Have less than 90% of lines starting with a bullet point.
  • +
  • +

    have less than 30% of lines ending with an ellipsis.

    +
  • +
  • +

    Have a low degree of repetitious text:

    +
  • +
  • Have less than 20% of characters contained within duplicate lines.
  • +
  • Have less than 20% of characters contained within duplicate paragraphs.
  • +
  • Where the top 2-4 grams constitute less than 20%, 18%, 16%, respectively, of the text.
  • +
  • Where the duplicate 5-10 grams constitute less than 25%, 24%, 23%, 22%, 21%, 20% +of the text, respectively.
  • +
+

The deduplication removed all documents with a 13-gram Jaccard similarity higher than 80% +following the MinHash algorithm [1] using 128 permutations. The MinHash algorithm is a +probabilistic data structure for approximating the Jaccard similarity between two sets.

+

Is the software used to preprocess/clean/label the instances available?

+

Yes, the scripts are available +here. +the scripts use version 0.0.2 of the +dfm package.

+

Uses

+

Has the dataset been used for any tasks already?

+

Yes, the dataset has been used to pre-train Danish language models. +Parts of the dataset have also been used in [3] and [4]

+

Is there a repository that links to any or all papers or systems that use the dataset?

+

No.

+

What (other) tasks could the dataset be used for?

+

The scale of the dataset makes it suitable for NLP tasks such as language modeling. +Similarly, the structure of the articles makes it a suitable dataset for training text +summarisation models.

+

Is there anything about the composition of the dataset or the way it was collected and +preprocessed/cleaned/labeled that might impact future uses?

+

This dataset is static and thus does not evolve over time with the language. +A consequence of this is that it will become increasingly outdated over time.

+

Are there tasks for which the dataset should not be used?

+

This dataset contains Danish articles and thus should not be used for non-Danish +language tasks.

+

As the writers of the content are predominantly journalists, it reflects a certain +writing style which is unlikely to reflect the Danish language as a whole.

+

Distribution

+

Will the dataset be distributed to third parties outside of the entity (e.g., company, institution, organization) on behalf of which the dataset was created?

+

Data will only be available at the entity during the project. If you wish to access the dataset you will have to come to an agreement with the individual +Danish newspapers.

+

Citation

+

If you wish to cite this work please see our GitHub page for an up-to-date citation: +https://github.com/centre-for-humanities-computing/danish-foundation-models

+

References:

+
    +
  • [1] Broder, Andrei Z. "On the resemblance and containment of documents." + Proceedings. Compression and Complexity of SEQUENCES 1997 + (Cat. No. 97TB100171). IEEE, 1997.
  • +
  • [2] Rae, J. W., Borgeaud, S., Cai, T., Millican, K., Hoffmann, J., Song, F., + Aslanides, J., Henderson, S., Ring, R., Young, S., Rutherford, E., Hennigan, + T., Menick, J., Cassirer, A., Powell, R., Driessche, G. van den, Hendricks, + L. A., Rauh, M., Huang, P.-S., … Irving, G. (2021). + Scaling Language Models: Methods, Analysis & Insights from Training Gopher. + https://arxiv.org/abs/2112.11446v2
  • +
  • [3] Baglini, R. B., Nielbo, K. L., Hæstrup, F., Enevoldsen, K., Vahlstrup, P. B., & + Roepstorff, A. (2021, June 2). When no news is bad news: Detection of negative + events from news media content. https://2021.dhbenelux.org/
  • +
  • [4] Nielbo, K. L., Baglini, R. B., Vahlstrup, P. B., Enevoldsen, K. C., Bechmann, A., + & Roepstorff, A. (2021, January). News information decoupling: An information + signature of catastrophes in legacy news media. https://eadh2020-2021.org/
  • +
  • [5] T. Gebru, J. Morgenstern, B. Vecchione, J. W. Vaughan, H. Wallach, H. Daumé III, + and K. Crawford. Datasheets for datasets. arXiv preprint arXiv:1803.09010, 2018.
  • +
+ + + + + + +
+
+ + + + +
+ + + +
+ + + +
+
+
+
+ + + + + + + + + \ No newline at end of file diff --git a/datasheets/daradio/index.html b/datasheets/daradio/index.html new file mode 100644 index 00000000..a4679007 --- /dev/null +++ b/datasheets/daradio/index.html @@ -0,0 +1,815 @@ + + + + + + + + + + + + + + + + + + + + DaRadio - Danish Foundation Models + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+ + + + +
+ + +
+ +
+ + + + + + + + + +
+
+ + + +
+
+
+ + + + + + + + +
+
+
+ + + + +
+
+ + + + + + + +

DaRadio Datasheet

+

Version: 1.0.0

+

Homepage: https://github.com/centre-for-humanities-computing/danish-foundation-models

+

License: Not publicly available.

+
+

DaRadio consists of radio broadcasts from the Danish radio stations DR P1 and Radio24Syv, and contains approximately 140.000 hours of speech. DaRadio includes all shows aired on DR P1 from 2005 to 2021, and all shows aired on Radio24Syv from 2011 to 2019.

+

DaRadio has been deduplicated using a series of heuristics based on metadata. For more on deduplication, see the data cleaning section further below.

+

Datasheet

+

Following the recommendation and framework of [1], we add the following datasheet.

+

Motivation:

+

For what purpose was the dataset created? Who created the dataset? Who funded the creation of the dataset?

+

Data included in DaRadio was collected following the Danish Legal Deposit Act by the Royal Danish Library (RDL). From this, a dataset of Danish speech-only radio was derived by RDL. The dataset was created for research purposes, including training a Danish wav2vec2.0 model.

+

The dataset was preprocessed to remove duplicates by a team of researchers at the Center for Humanities Computing, Aarhus University (CHC) with collaborators from the Danish speech-processing company Alvenir.

+

Composition

+

What do the instances that comprise the dataset represent (e.g., documents, photos, people, countries)?

+

Instances of the dataset include an mp3 file for each show aired on the two stations within the period. Further metadata include information on date and time of airing, title, short description of the show, and various internal identifiers used by RDL.

+

How many instances are there in total (of each type, if appropriate)?

+

DaRadio consists of a total of 215.582 hours of unprocessed Danish speech radio shows across two stations, DR P1 and Radio24syv. The table below shows the distribution over the stations with and without heuristic rerun removal.

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
SourceDuration (hours)Reruns removed
P1145.160False
97.401True
Radio24syv70.422False
44.569True
+

Does the dataset contain all possible instances or is it a sample (not necessarily random) of instances from a larger set?

+

The dataset contains all shows from the two stations in the time period (2005-2021 for DR P1 and 2011-2019 for Radio24syv).

+

If the dataset is a sample from a larger set, what was the sampling strategy?

+

The dataset is a subset of all Danish radio. The two stations were chosen for the dataset as they are talk-radio only.

+

Who was involved in the data collection process?

+

The RDL collects Danish radio shows and constructed DaRadio for handing to researchers at CHC.

+

Over what timeframe was the data collected?

+

The dataset includes radio shows from the period 2005 to 2021.

+

Were any ethical review processes conducted?

+

The RDL collects radio shows in adherence to Danish Archival laws. DaRadio was constructed for a research project, for which a project proposal was accepted by RDL. No other ethical review processes were conducted.

+

Preprocessing/cleaning/labeling

+

Was any preprocessing/Cleaning/Labeling of the data done +(e.g., discretization or bucketing, tokenization, part-of-speech tagging, +SIFT feature extraction, removal of instances, processing of missing values)?

+

DaRadio has been deduplicated using a series of heuristic filters and all files have been converted to 16 kHz .wav files.

+

Reruns/duplicates were identified by the following rules:

+
    +
  • If the phrase "sendt første gang" ["aired the first time"] or "genudsendelse" ["rerun"] appeared in the show description.
  • +
  • If the title contained "(G)" (short for "genudsendelse")
  • +
  • If the show was broadcast between 23:00 and 5:00.
  • +
+

The deduplication was coded and conducted by researchers at CHC.

+

Is the software used to preprocess/clean/label the instances available?

+

The scripts are available at the following GitHub repository: link.

+

Uses

+

Has the dataset been used for any tasks already?

+

Yes, the dataset has been used to pre-train a Danish wav2vec2.0 model.

+

Is there a repository that links to any or all papers or systems that use the dataset?

+

No, but as of 23/10/16 no others have used the dataset.

+

What (other) tasks could the dataset be used for?

+

As the dataset only contains un-labelled data, i.e. no transcriptions, it is mainly designed for pre-training language models. However, given the metadata and recurring hosts, further processing might make it possible to train e.g. text-to-speech systems.

+

Is there anything about the composition of the dataset or the way it was collected and +preprocessed/cleaned/labeled that might impact future uses?

+

This dataset is static and does not evolve over time with the language, thus will become increasingly outdated over time.

+

Distribution

+

Will the dataset be distributed to third parties outside of the entity (e.g., company, institution, organization) on behalf of which the dataset was created?

+

Data will only be available at the entity during the project. An equivalent or updated dataset can be requested at the Royal Danish Library.

+

Citation

+

If you wish to cite this work please see our GitHub page for an up to date citation: https://github.com/centre-for-humanities-computing/danish-foundation-models

+

References:

+
    +
  • [1] T. Gebru, J. Morgenstern, B. Vecchione, J. W. Vaughan, H. Wallach, H. Daumé III, + and K. Crawford. Datasheets for datasets. arXiv preprint arXiv:1803.09010, 2018.
  • +
+ + + + + + +
+
+ + + + +
+ + + +
+ + + +
+
+
+
+ + + + + + + + + \ No newline at end of file diff --git a/datasheets/hopetwitter/index.html b/datasheets/hopetwitter/index.html new file mode 100644 index 00000000..c0894606 --- /dev/null +++ b/datasheets/hopetwitter/index.html @@ -0,0 +1,897 @@ + + + + + + + + + + + + + + + + + + + + + + HopeTwitter - Danish Foundation Models + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+ + + + +
+ + +
+ +
+ + + + + + + + + +
+
+ + + +
+
+
+ + + + + + + + +
+
+
+ + + + +
+
+ + + + + + + +

HopeTwitter

+

Version: 1.0.0

+

Homepage: https://github.com/centre-for-humanities-computing/danish-foundation-models

+

License: Not publicly available.

+
+

HopeTwitter consists of tweets collected from the Twitter API using a stopword list +and consists of 32.5 million tweets across 538,398 unique users. HopeTwitter includes +tweets from 2019-01-01 to 2021-04-30.

+

HopeTwitter has been filtered to only include Danish tweets, based on the language tag from the Twitter API. Similarly, HopeTwitter +has had low-quality tweets removed and was then deduplicated to remove exact and +near-duplicates. For more on data cleaning see the section +"Preprocessing/cleaning/labeling".

+

HopeTwitter includes a total of 0.97 billion tokens before filtering and includes 0.48 +billion (50%) after.

+

Datasheet

+

Following the recommendation and framework of [3] we add the following datasheet.

+

Motivation

+

For what purpose was the dataset created? Who created the dataset? Who funded the +creation of the dataset?

+

HopeTwitter was initially collected as a part of the +HOPE project, examining societal behaviour during the +COVID-19 pandemic. Next, HopeTwitter was cleaned in preparation for pre-training Danish language +models by a team of researchers at Center for Humanities Computing Aarhus +(CHCAA), using +a codebase jointly developed with partners from academia and industry, including KMD, +Ekstra Bladet, Bristol University and Deepdivr. For more on collaborators on this +project see the +GitHub repository.

+

Any other comments?

+

No.

+

Composition

+

What do the instances that comprise the dataset represent (e.g., documents, photos, +people, countries)?

+

HopeTwitter consists of tweets containing at least one of a series of stopwords, +collected through the Twitter API. See "If the dataset is a sample from a larger set, +what was the sampling strategy?" for the stopword list.

+

How many instances are there in total (of each type, if appropriate)?

+

The dataset consist of 32,499,019 documents where 14,399,284 (44%) were considered +duplicates.

+

Does the dataset contain all possible instances or is it a sample (not necessarily +random) of instances from a larger set?

+

No. It does not contain all instances of Danish Twitter as there are likely some Danish +tweets which do not include a stopword.

+

Is there a label or target associated with each instance? If so, please provide a +description.

+

No.

+

Are there recommended data splits (e.g., training, development/validation, testing)? +If so, please provide a description of these splits, explaining the rationale behind +them.

+

No splits are performed on this dataset.

+

If the dataset is a sample from a larger set, what was the sampling strategy?

+

Tweets are streamed continuously by querying a set of the highest +frequency Scandinavian-specific keywords from Danish, Norwegian (Bokmål) and Swedish, +resulting in the following list: +

aften, aldrig, alltid, altid, andet, arbejde, bedste, behöver, behøver, beklager,
+berätta, betyr, blev, blevet, blir, blitt, blive, bliver, bruge, burde, bättre, båe
+bør, deim, deires, ditt, drar, drepe, dykk, dykkar, där, död, döda, død, døde, efter,
+elsker, endnu, faen, fandt, feil, fikk, finner, flere, forstår, fortelle, fortfarande,
+fortsatt, fortælle, från, få, fået, får, fått, förlåt, första, försöker, før, først,
+første, gick, gikk, gillar, gjennom, gjerne, gjorde, gjort, gjør, gjøre, godt, gå, gång,
+går, göra, gør, gøre, hadde, hallå, havde, hedder, helt, helvete, hende, hendes, hennes,
+herregud, hjelp, hjelpe, hjem, hjälp, hjå, hjælp, hjælpe, honom, hossen, hvem, hvis,
+hvordan, hvorfor, händer, här, håll, håller, hør, høre, hører, igjen, ikkje, ingenting,
+inkje, inte, intet, jeres, jävla, kanske, kanskje, kender, kjenner, korleis, kvarhelst,
+kveld, kven, kvifor, känner, ledsen, lenger, lidt, livet, längre, låt, låter, længe,
+meget, menar, mycket, mykje, må, måde, många, mår, måske, måste, måtte, navn, nogen,
+noget, nogle, noko, nokon, nokor, nokre, någon, något, några, nån, når, nåt, nødt,
+också, også, pengar, penger, pratar, prøver, på, redan, rundt, rätt, sagde, saker,
+samma, sammen, selv, selvfølgelig, sidan, sidste, siger, sikker, sikkert, själv, skete,
+skjedde, skjer, skulle, sluta, slutt, snakke, snakker, snill, snälla, somt, stadig,
+stanna, sted, står, synes, säger, sätt, så, sådan, såg, sånn, tager, tiden, tilbage,
+tilbake, tillbaka, titta, trenger, trodde, troede, tror, två, tycker, tänker, uden,
+undskyld, unnskyld, ursäkta, uten, varför, varit, varte, veldig, venner, verkligen,
+vidste, vilken, virkelig, visste, väg, väl, väldigt, vän, vår, våra, våre, væk, vær, 
+være, været, älskar, åh, år, åt, över
+

+

Who was involved in the data collection process?

+

A team of researchers at the Center for Humanities +Computing Aarhus (CHCAA), including Kristoffer Nielbo and Peter Bjerregaard Vahlstrup, in collaboration with Rebekah Baglini, at the School of Communcation and Culture at Aarhus university.

+

Over what timeframe was the data collected?

+

The dataset includes tweets from the period 2019-01-01 to 2021-04-30.

+

Were any ethical review processes conducted?

+

No

+

Preprocessing/cleaning/labeling

+

Was any preprocessing/Cleaning/Labeling of the data done +(e.g., discretization or bucketing, tokenization, part-of-speech tagging, +SIFT feature extraction, removal of instances, processing of missing values)?

+

Firstly, HopeTwitter had non-Danish tweets removed, after which a series of +heuristic filters were applied, including the removal of repetitious texts. Following the filtering, +HopeTwitter was deduplicated, removing both exact duplicates and near-duplicates.

+

Of all documents, 3,023,427 (9%) were filtered due to low-quality and +14,399,284 (33%) because they were near-duplicates.

+

For the quality filtering, HopeTwitter applies a filter akin to [2] which contains text +that:

+
    +
  • Contain at least 2 Danish stopwords. For the stopword list we use the one used in +SpaCy v.3.1.4.
  • +
  • Have a mean word length between 2 and 14.
  • +
  • Have a token length between 10 and 100,000.
  • +
  • Have less than 5,000,000 characters.
  • +
  • +

    Have less than 60% of words containing an alphabetic character.

    +
  • +
  • +

    Have a low degree of repetitious text:

    +
  • +
  • Have less than 20% of characters contained within duplicate lines.
  • +
  • Have less than 20% of characters contained within duplicate paragraphs.
  • +
  • Where the top 2-4 grams constitute less than 20%, 18%, 16%, respectively, of the text.
  • +
  • Where the duplicate 5-10 grams constitute less than 25%, 24%, 23%, 22%, 21%, 20% +of the text, respectively.
  • +
+

The deduplication removed all documents with a 10-gram Jaccard similarity higher than 80% +following the MinHash algorithm [1] using 128 permutations. The MinHash algorithm is a +probabilistic data structure for approximating the Jaccard similarity between two sets.

+

Is the software used to preprocess/clean/label the instances available?

+

Yes, the scripts are available +here. +The scripts use version 0.0.2 of the +dfm package.

+

Uses

+

Has the dataset been used for any tasks already?

+

Yes, the dataset has been used to pre-train Danish language models. +Parts of the dataset have also been used in HOPE project reports +and in [4].

+

Is there a repository that links to any or all papers or systems that use the dataset?

+

There is a website for the HOPE project for which the dataset was initially collected. This website contains reports and articles regarding the dataset.

+

What (other) tasks could the dataset be used for?

+

The scale of the dataset makes it suitable for NLP tasks such as language modelling. +Similarly, one could imagine using the conversation structure could be used to train +conversational chatbots.

+

Is there anything about the composition of the dataset or the way it was collected and +preprocessed/cleaned/labeled that might impact future uses?

+

This dataset is static and thus does not evolve over time with the language. +A consequence of this is that it will become increasingly outdated over time. However, +it is possible to extend the dataset by a continual collection of tweets.

+

Are there tasks for which the dataset should not be used?

+

HopeTwitter contains Danish tweets and thus should not be used for non-Danish language tasks.

+

As the writers of the content are predominantly journalists, politicians, influencers, +and academics, it reflects a certain social group which is unlikely to reflect the Danish +population as a whole.

+

Distribution

+

Will the dataset be distributed to third parties outside of the entity (e.g., company, institution, organization) on behalf of which the dataset was created?

+

Data will only be available at the entity during the project. After the project the data will be archived for a period of five years to comply with the university [policy] for research integrity. After the five years, the data will be registered at the national archives as required by executive order 514 for potential long-term deposit.

+

Citation

+

If you wish to cite this work please see our GitHub page for an up to date citation: +https://github.com/centre-for-humanities-computing/danish-foundation-models

+

References:

+
    +
  • [1] Broder, Andrei Z. "On the resemblance and containment of documents." + Proceedings. Compression and Complexity of SEQUENCES 1997 + (Cat. No. 97TB100171). IEEE, 1997.
  • +
  • [2] Rae, J. W., Borgeaud, S., Cai, T., Millican, K., Hoffmann, J., Song, F., + Aslanides, J., Henderson, S., Ring, R., Young, S., Rutherford, E., Hennigan, + T., Menick, J., Cassirer, A., Powell, R., Driessche, G. van den, Hendricks, + L. A., Rauh, M., Huang, P.-S., … Irving, G. (2021). + Scaling Language Models: Methods, Analysis & Insights from Training Gopher. + https://arxiv.org/abs/2112.11446v2
  • +
  • [3] T. Gebru, J. Morgenstern, B. Vecchione, J. W. Vaughan, H. Wallach, H. Daumé III, + and K. Crawford. Datasheets for datasets. arXiv preprint arXiv:1803.09010, 2018.
  • +
  • [4] Johansen, N., Marjanovic, S. V., Kjaer, C. V., Baglini, R. B., & Adler-Nissen, R. + (2022). Ridiculing the “tinfoil hats:” Citizen responses to COVID-19 misinformation + in the Danish facemask debate on Twitter. Harvard Kennedy School Misinformation + Review. https://doi.org/10.37016/mr-2020-93
  • +
+ + + + + + +
+
+ + + + +
+ + + +
+ + + +
+
+
+
+ + + + + + + + + \ No newline at end of file diff --git a/datasheets/netarkivet_text/index.html b/datasheets/netarkivet_text/index.html new file mode 100644 index 00000000..50d7e607 --- /dev/null +++ b/datasheets/netarkivet_text/index.html @@ -0,0 +1,946 @@ + + + + + + + + + + + + + + + + + + + + + + NAT - Danish Foundation Models + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+ + + + +
+ + +
+ +
+ + + + + + + + + +
+
+ + + +
+
+
+ + + + + + + + +
+
+
+ + + + +
+
+ + + + + + + +

NAT: Netarkivet Text

+

Version: 1.0.0

+

Homepage: https://github.com/centre-for-humanities-computing/danish-foundation-models

+

License: Not publicly available.

+
+

Netarkivet Text (NAT) consists of a subsection of Netarkivet and +contains 2,332 million sites across 1.6 million domains. +Netarkivet includes sites from the period 2006 to 2016.

+

NAT has been filtered using a series of heuristic filters and removing repetitious texts. +Following the filtering, NAT is further deduplicated to remove exact and near-duplicates. For more on data cleaning, +see the post processing section below.

+

The sites which passed the quality filter were deduplicated per year. NAT consists of 865 billion tokens of which 134 billion (15%) were left after filtering and deduplication.

+

Datasheet

+

Following the recommendation and framework of [3], we add the following datasheet.

+

Motivation:

+

For what purpose was the dataset created? Who created the dataset? Who funded the creation of the dataset?

+

Netarkivet was created following the Danish Legal Deposit Act, +from which a text-only corpus was derived for research purposes, see [4,5]. This is the +part from which this dataset is derived. +This part has then been filtered with the intention of training Danish language

+

models by a team of researchers at the Center for Humanities Computing Aarhus (CHCAA) using +a codebase jointly developed with partners from industry (e.g. KMD, Ekstra Bladet) and +other research institutions (e.g. Bristol University, Alexandra Institute). +For more on collaborators on this project see the GitHub repository.

+

Any other comments?

+

No.

+

Composition

+

What do the instances that comprise the dataset represent (e.g., documents, photos, people, countries)?

+

Instances of the dataset are Danish domain sites, which further include metadata such as:

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
ColumnDtype
0harvest_idint32
1job_idint32
2sha1object
3mime_servedobject
4languageobject
5mime_droidobject
6timestampobject
7uriobject
9domain_keyobject
+

Where harvest_id is the id of the associated Netarkivet web harvest. Each web harvest +consists of jobs, each with their associated job-id.

+

Language is the language classified using the following language detection library. uri is the URI of the site e.g. "http://www.apple.com/podcasting". +timestamp is the date given in the format "20060612105533", indicating year, month, date, and time. +The sha1 is the website hash. +mime_* indicates the mime/media type. +mime_served could for instance be "text/html; charset=iso-8859-1" and mime_droid could be "text/html; version=2.0" and is the mime type identified by the server and by DROID, respectively. +How many instances are there in total (of each type, if appropriate)?

+

NAT contains a total of 2,332 million sites distributed over 1.6 million domains.
+1,370 million of these sites are Danish, with the largest secondary language being English +with 718 million sites.

+

Does the dataset contain all possible instances or is it a sample (not necessarily random) of instances from a larger set?

+

These domains are a subset of Netarkivet, which again is a sample of all the Danish content on the internet.

+

If the dataset is a sample from a larger set, what was the sampling strategy?

+

Netarkivet has been scraped from the internet using the following procedures:

+
    +
  • Cross-sectional collection of all Danish websites up to four times a year.
  • +
  • Selective collection of the following domains: All Danish newsmedia (with a frequency +ranging from 12 times a day to weekly), political parties, organizations and unions, +legal bodies such as ministries and agencies, selected social media profiles, and YouTube videos.
  • +
  • Event collections of 2-3 events yearly (e.g. elections and the COVID pandemic)
  • +
  • Miscellaneous/on-demand web scrapes (for instance in collaboration with researchers)
  • +
+

A selective subset of Netarkivet is then extracted per year from 2006 to 2016 such that +it contains no duplicate sites. Apache Tika (v. 1.15) is then used to extract the text from the sites. +During extraction, all HTML markup is removed, along with javascript and CSS code. +The text of textual HTML elements, such as <P> and <H1>, is concatenated into one piece of text.

+

Who was involved in the data collection process?

+

The Royal Danish Library collects Netarkivet, and Brügger et al. [4,5] +helped with the construction of NAT.

+

Over what timeframe was the data collected?

+

The dataset includes articles from the period 2006 to 2016.

+

Were any ethical review processes conducted?

+

Netarkivet is collected in adherence to an update to the Danish archival law in 2005, +which extended the law to also include internet domains.

+

Our text subset was constructed for a research project and thus a project proposal +has been accepted by the Royal Danish Library. Besides these, the author is not aware of +any ethical approvals.

+

Preprocessing/cleaning/labeling

+

Was any preprocessing/Cleaning/Labeling of the data done +(e.g., discretization or bucketing, tokenization, part-of-speech tagging, +SIFT feature extraction, removal of instances, processing of missing values)?

+

NAT has been filtered using a series of heuristic filters as well as removing +repetitious texts. Following the filtering, the corpus was deduplicated to remove exact and +near-duplicates.

+

For quality filtering, NAT applies a filter akin to [2] which contains text +that:

+
    +
  • Contain at least 2 Danish stopwords. For the stopword list, we use the one used in +SpaCy v.3.1.4.
  • +
  • Have a mean word length between 3 and 10.
  • +
  • Have a token length between 50 and 100,000.
  • +
  • Have less than 5,000,000 characters.
  • +
  • Have less than 60% of words containing an alphabetic character.
  • +
  • Have a symbol-to-word ratio lower than 10% for hashtags and ellipsis.
  • +
  • Have less than 90% of lines starting with a bullet point.
  • +
  • +

    have less than 30% of lines ending with an ellipsis.

    +
  • +
  • +

    Have a low degree of repetitious text:

    +
  • +
  • Have less than 20% of characters contained within duplicate lines.
  • +
  • Have less than 20% of characters contained within duplicate paragraphs.
  • +
  • Where the top 2-4 grams constitute less than 20%, 18%, 16%, respectively, of the text.
  • +
  • Where the duplicate 5-10 grams constitute less than 25%, 24%, 23%, 22%, 21%, 20% +of the text, respectively.
  • +
+

The deduplication removed all documents with a 13-gram Jaccard similarity higher than 80% +following the MinHash algorithm [1] using 128 permutations. The MinHash algorithm is a +probabilistic data structure for approximating the Jaccard similarity between two sets.

+

Is the software used to preprocess/clean/label the instances available?

+

Yes, the scripts are available +here. +The scripts use version 0.0.2 of the +dfm package.

+

Uses

+

Has the dataset been used for any tasks already?

+

Yes, the dataset has been used to pre-train Danish language models. +Furthermore, the unfiltered dataset has also been used in [4] and [5], for examining +the development of the Danish web.

+

Is there a repository that links to any or all papers or systems that use the dataset?

+

No.

+

What (other) tasks could the dataset be used for?

+

The scale of the dataset makes it suitable for NLP tasks such as language modelling. +It is likely possible to extract reviews, social media posts and similar semi-labelled +datasets from the dataset which can be used for NLP task such as sentiment analysis or +hate-speech detection.

+

The content of dataset makes it useable in a wide range of other applications in media +studies, social science or humanities, including development of written Danish, +emerging conspiracy theories, and online information dynamics.

+

Is there anything about the composition of the dataset or the way it was collected and +preprocessed/cleaned/labeled that might impact future uses?

+

This dataset is static and thus does not evolve over time with the language, thus will +become increasingly outdated over time. Netarkivet, from which it is derived, is +not static however, and is thus likely to further develop, which will allow us to update the +dataset going forward.

+

Distribution

+

Will the dataset be distributed to third parties outside of the entity (e.g., company, institution, organization) on behalf of which the dataset was created?

+

Data will only be available at the entity during the project. An equivalent or updated dataset can be requested at the Royal Danish Library.

+

Citation

+

If you wish to cite this work please see our GitHub page for an up to date citation: https://github.com/centre-for-humanities-computing/danish-foundation-models

+

References:

+
    +
  • [1] Broder, Andrei Z. "On the resemblance and containment of documents." + Proceedings. Compression and Complexity of SEQUENCES 1997 + (Cat. No. 97TB100171). IEEE, 1997.
  • +
  • [2] Rae, J. W., Borgeaud, S., Cai, T., Millican, K., Hoffmann, J., Song, F., + Aslanides, J., Henderson, S., Ring, R., Young, S., Rutherford, E., Hennigan, + T., Menick, J., Cassirer, A., Powell, R., Driessche, G. van den, Hendricks, + L. A., Rauh, M., Huang, P.-S., … Irving, G. (2021). + Scaling Language Models: Methods, Analysis & Insights from Training Gopher. + https://arxiv.org/abs/2112.11446v2
  • +
  • [3] T. Gebru, J. Morgenstern, B. Vecchione, J. W. Vaughan, H. Wallach, H. Daumé III, + and K. Crawford. Datasheets for datasets. arXiv preprint arXiv:1803.09010, 2018.
  • +
  • [4] Brügger, N., Nielsen, J., & Laursen, D. (2020). Big data experiments with the + archived Web: Methodological reflections on studying the development of a + nation’s Web. First Monday. https://doi.org/10.5210/fm.v25i3.10384
  • +
  • [5] Brügger, N. (2021). Digital humanities and web archives: Possible new paths for + combining datasets. International Journal of Digital Humanities, 2(1), 145–168. + https://doi.org/10.1007/s42803-021-00038-z
  • +
+ + + + + + +
+
+ + + + +
+ + + +
+ + + +
+
+
+
+ + + + + + + + + \ No newline at end of file diff --git a/dcc/index.html b/dcc/index.html new file mode 100644 index 00000000..3752005a --- /dev/null +++ b/dcc/index.html @@ -0,0 +1,647 @@ + + + + + + + + + + + + + + + + + + + + + + DCC - Danish Foundation Models + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+ + + + +
+ + +
+ +
+ + + + + + + + + +
+
+ + + +
+
+
+ + + + + + + + +
+
+
+ + + + +
+
+ + + + + + + +

DCC v1

+

The DCC is a composite corpus consisting of the following subcorpora.

+ + + + + + +
+
+ + + + +
+ + + +
+ + + +
+
+
+
+ + + + + + + + + \ No newline at end of file diff --git a/desc_stats/netarkivet_sites_collected_pr_month.png b/desc_stats/netarkivet_sites_collected_pr_month.png new file mode 100644 index 00000000..352beec6 Binary files /dev/null and b/desc_stats/netarkivet_sites_collected_pr_month.png differ diff --git a/desc_stats/netarkivet_top_50_domains.png b/desc_stats/netarkivet_top_50_domains.png new file mode 100644 index 00000000..5c1ea35e Binary files /dev/null and b/desc_stats/netarkivet_top_50_domains.png differ diff --git a/index.html b/index.html new file mode 100644 index 00000000..331099b2 --- /dev/null +++ b/index.html @@ -0,0 +1,622 @@ + + + + + + + + + + + + + + + + + + + + Danish Foundation Models + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+
+ +
+ + + + +
+ + +
+ +
+ + + + + + + + + +
+
+ + + +
+
+
+ + + + + + + + +
+
+
+ + + + +
+
+ + + + + + + +

About

+ +

This website is under construction 🛠️

+ + + + + + +
+
+ + + + +
+ + + +
+ + + +
+
+
+
+ + + + + + + + + \ No newline at end of file diff --git a/intercoder_reliability/index.html b/intercoder_reliability/index.html new file mode 100644 index 00000000..59aad736 --- /dev/null +++ b/intercoder_reliability/index.html @@ -0,0 +1,701 @@ + + + + + + + + + + + + + + + + + + Results from corpus tagging - Danish Foundation Models + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+ + + + +
+ + +
+ +
+ + + + + + + + + +
+
+ + + +
+
+
+ + + + + + + + +
+
+
+ + + + +
+
+ + + + + + + +

Results from corpus tagging

+

Each user tagged 100 documents unless otherwise specified. Documents were split by newlines into text-blocks, and each block was rated. +Text-blocks longer than 1000 characters were split into multiple blocks of 1000 characters or less.

+

This tagging scheme is similar to +(Kreutzer et al., 2022).

+

Each block was put into one of the following categories: +Each user tagged 100 documents (unless otherwise specified). Each document was tagged

+
    +
  • wrong_language: Not Danish
  • +
  • skipped: Unsure of category
  • +
  • correct_language: Danish text where at least 80% of the text is reasonable.
  • +
  • not_language: Text where less than 80% of the text is reasonable. Takes priority over wrong_language.
  • +
+

Additionally, each block was tagged for pornography (yes/no) and offensiveness (yes/no).

+

Text proportions

+
+

Kenneth (Session: test)

+
    +
  • Date: 2022-09-05
  • +
  • Sentences tagged: 102
  • +
  • Documents tagged: na
  • +
+

Proportions:

+
    +
  • 69.16% of characters is correct_language
  • +
  • 25.66% of characters is not_language
  • +
  • 2.74% of characters is skipped
  • +
  • 2.45% of characters is wrong_language
  • +
  • 0.00% of characters is porn
  • +
  • 0.00% of characters is offensive
  • +
+

Kenneth (Session: 1)

+
    +
  • Date: 2022-09-06
  • +
  • Sentences tagged: 292
  • +
  • Documents tagged: 100
  • +
+

Proportions:

+
    +
  • 68.03% of characters is correct_language
  • +
  • 29.19% of characters is not_language
  • +
  • 2.10% of characters is skipped
  • +
  • 0.68% of characters is wrong_language
  • +
  • 0.00% of characters is porn
  • +
  • 1.38% of characters is offensive
  • +
+

Lasse (Session: 1)

+
    +
  • Date: 2022-09-07
  • +
  • Sentences tagged: 336
  • +
  • Documents tagged: 100
  • +
+

Proportions:

+
    +
  • 68.02% of characters is correct_language
  • +
  • 30.97% of characters is not_language
  • +
  • 1.01% of characters is wrong_language
  • +
  • 0.26% of characters is porn
  • +
  • 0.00% of characters is offensive
  • +
+

Intercoder Reliability

+
+

Kenneth (Session: test) vs Kenneth - (Session: 1)

+
    +
  • +

    Cohen's Kappa (all categories): 0.8242 (Overlap in sentences: 98)

    +
  • +
  • +

    Cohen's Kappa (correct_language vs not correct_language): 0.9075 (Overlap in sentences: 98)

    +
  • +
+

Kenneth (Session: test) vs Lasse - (Session: 1)

+
    +
  • +

    Cohen's Kappa (all categories): 0.8140 (Overlap in sentences: 95)

    +
  • +
  • +

    Cohen's Kappa (correct_language vs not correct_language): 0.8389 (Overlap in sentences: 95)

    +
  • +
+

Kenneth (Session: 1) vs Lasse - (Session: 1)

+
    +
  • +

    Cohen's Kappa (all categories): 0.6767 (Overlap in sentences: 245)

    +
  • +
  • +

    Cohen's Kappa (correct_language vs not correct_language): 0.7259 (Overlap in sentences: 245)

    +
  • +
+

Comparison with mC4

+
+

Note: mC4 did have a high degree of repetitious texts. Similarly, when text blocks were not language, they were often something like:

+
2lineStart%22%3A%22%22%2C%22placeholder%22%3A1%2C%22extName%22%3A%22nowiki%22%7D"" class=""placeholder placeholder-ext"" contenteditable=""false"">]&#x200b;</span></a></sup>&#x200b;</span>, at en lurifaks som Jimmy page, bruger MIT navn til opfindelsen! SV<span data-rte-instance=""1524-12953202845f3523698f3f1"" data-rte-meta=""%7B%22type%22%3A%22ext%22%2C%22wikitext%22%3A%22%3Cref%3ESVIN%3C%5C%2Fref%3E%22%2C%22lineStart%22%3A%22%22%2C%22placeholder%22%3A1%2C%22extName%22%3A%22ref%22%7D"" class=""placeholder placeholder-ext"" contenteditable=""false""><sup data-rte-washtml=""1"" id=""cite_ref-2"" class=""reference"" data-rte-attribs=""
+
+

While non-language texts in NAT were often menu bars, contact information, or navigation.

+

Kenneth (Session: 1)

+
    +
  • Date: 2022-09-06
  • +
  • Sentences tagged: 325
  • +
  • Documents tagged: 100
  • +
+

Proportions:

+
    +
  • 62.47% of characters is correct_language
  • +
  • 34.88% of characters is not_language
  • +
  • 1.27% of characters is skipped
  • +
  • 1.38% of characters is wrong_language
  • +
  • 3.25% of characters is porn
  • +
  • 0.00% of characters is offensive
  • +
+ + + + + + +
+
+ + + + +
+ + + +
+ +
+ + + + +
+ +
+
+
+
+ + + + + + + + + \ No newline at end of file diff --git a/models_speech/index.html b/models_speech/index.html new file mode 100644 index 00000000..1e65e08f --- /dev/null +++ b/models_speech/index.html @@ -0,0 +1,663 @@ + + + + + + + + + + + + + + + + + + + + + + Speech - Danish Foundation Models + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+
+ +
+ + + + +
+ + +
+ +
+ + + + + + + + + +
+
+ + + +
+
+
+ + + + + + + + +
+
+
+ + + + +
+ +
+ + + + +
+ + + +
+ + + +
+
+
+
+ + + + + + + + + \ No newline at end of file diff --git a/models_text/index.html b/models_text/index.html new file mode 100644 index 00000000..14589384 --- /dev/null +++ b/models_text/index.html @@ -0,0 +1,667 @@ + + + + + + + + + + + + + + + + + + + + + + Text - Danish Foundation Models + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+
+ +
+ + + + +
+ + +
+ +
+ + + + + + + + + +
+
+ + + +
+
+
+ + + + + + + + +
+
+
+ + + + +
+ +
+ + + + +
+ + + +
+ + + +
+
+
+
+ + + + + + + + + \ No newline at end of file diff --git a/search/search_index.json b/search/search_index.json new file mode 100644 index 00000000..af877e75 --- /dev/null +++ b/search/search_index.json @@ -0,0 +1 @@ +{"config":{"lang":["en"],"separator":"[\\s\\-]+","pipeline":["stopWordFilter"]},"docs":[{"location":"","title":"About","text":"

This website is under construction \ud83d\udee0\ufe0f

"},{"location":"dcc/","title":"DCC v1","text":"

The DCC is a composite corpus consisting of the following subcorpora.

"},{"location":"intercoder_reliability/","title":"Results from corpus tagging","text":"

Each user tagged 100 documents unless otherwise specified. Documents were split by newlines into text-blocks, and each block was rated. Text-blocks longer than 1000 characters were split into multiple blocks of 1000 characters or less.

This tagging scheme is similar to (Kreutzer et al., 2022).

Each block was put into one of the following categories: Each user tagged 100 documents (unless otherwise specified). Each document was tagged

  • wrong_language: Not Danish
  • skipped: Unsure of category
  • correct_language: Danish text where at least 80% of the text is reasonable.
  • not_language: Text where less than 80% of the text is reasonable. Takes priority over wrong_language.

Additionally, each block was tagged for pornography (yes/no) and offensiveness (yes/no).

"},{"location":"intercoder_reliability/#text-proportions","title":"Text proportions","text":"

Kenneth (Session: test)

  • Date: 2022-09-05
  • Sentences tagged: 102
  • Documents tagged: na

Proportions:

  • 69.16% of characters is correct_language
  • 25.66% of characters is not_language
  • 2.74% of characters is skipped
  • 2.45% of characters is wrong_language
  • 0.00% of characters is porn
  • 0.00% of characters is offensive

Kenneth (Session: 1)

  • Date: 2022-09-06
  • Sentences tagged: 292
  • Documents tagged: 100

Proportions:

  • 68.03% of characters is correct_language
  • 29.19% of characters is not_language
  • 2.10% of characters is skipped
  • 0.68% of characters is wrong_language
  • 0.00% of characters is porn
  • 1.38% of characters is offensive

Lasse (Session: 1)

  • Date: 2022-09-07
  • Sentences tagged: 336
  • Documents tagged: 100

Proportions:

  • 68.02% of characters is correct_language
  • 30.97% of characters is not_language
  • 1.01% of characters is wrong_language
  • 0.26% of characters is porn
  • 0.00% of characters is offensive
"},{"location":"intercoder_reliability/#intercoder-reliability","title":"Intercoder Reliability","text":"

Kenneth (Session: test) vs Kenneth - (Session: 1)

  • Cohen's Kappa (all categories): 0.8242 (Overlap in sentences: 98)

  • Cohen's Kappa (correct_language vs not correct_language): 0.9075 (Overlap in sentences: 98)

Kenneth (Session: test) vs Lasse - (Session: 1)

  • Cohen's Kappa (all categories): 0.8140 (Overlap in sentences: 95)

  • Cohen's Kappa (correct_language vs not correct_language): 0.8389 (Overlap in sentences: 95)

Kenneth (Session: 1) vs Lasse - (Session: 1)

  • Cohen's Kappa (all categories): 0.6767 (Overlap in sentences: 245)

  • Cohen's Kappa (correct_language vs not correct_language): 0.7259 (Overlap in sentences: 245)

Comparison with mC4

Note: mC4 did have a high degree of repetitious texts. Similarly, when text blocks were not language, they were often something like:

2lineStart%22%3A%22%22%2C%22placeholder%22%3A1%2C%22extName%22%3A%22nowiki%22%7D\"\" class=\"\"placeholder placeholder-ext\"\" contenteditable=\"\"false\"\">]&#x200b;</span></a></sup>&#x200b;</span>, at en lurifaks som Jimmy page, bruger MIT navn til opfindelsen! SV<span data-rte-instance=\"\"1524-12953202845f3523698f3f1\"\" data-rte-meta=\"\"%7B%22type%22%3A%22ext%22%2C%22wikitext%22%3A%22%3Cref%3ESVIN%3C%5C%2Fref%3E%22%2C%22lineStart%22%3A%22%22%2C%22placeholder%22%3A1%2C%22extName%22%3A%22ref%22%7D\"\" class=\"\"placeholder placeholder-ext\"\" contenteditable=\"\"false\"\"><sup data-rte-washtml=\"\"1\"\" id=\"\"cite_ref-2\"\" class=\"\"reference\"\" data-rte-attribs=\"\"\n

While non-language texts in NAT were often menu bars, contact information, or navigation.

Kenneth (Session: 1)

  • Date: 2022-09-06
  • Sentences tagged: 325
  • Documents tagged: 100

Proportions:

  • 62.47% of characters is correct_language
  • 34.88% of characters is not_language
  • 1.27% of characters is skipped
  • 1.38% of characters is wrong_language
  • 3.25% of characters is porn
  • 0.00% of characters is offensive
"},{"location":"models_speech/","title":"Speech","text":"

This section contain references to models trained on speech

Model Model type xls-r-300m-danish Pretrained wav2vec2.0 model xls-r-300m-danish-nst-cv9 Automatic speech recognition chcaa/xls-r-300m-nst-cv9-da Automatic speech recognition"},{"location":"models_text/","title":"Text","text":"

This section contain references to models trained on text

Model Model type Size (parameters) dfm-encoder-large-v1 Encoder large (355M) dfm-encoder-medium-v1 Encoder medium (110M) dfm-encoder-small-v1 Encoder small (22M)"},{"location":"datasheets/danews/","title":"DaNews","text":"

Version: 1.0.0

Homepage: https://github.com/centre-for-humanities-computing/danish-foundation-models

license: Not publicly available.

DaNews consists of articles from Danish news and tabloid media from 1 December 2019 to 30 April 2021. The articles stem from multiple news sources, including both online of physical newspapers.

"},{"location":"datasheets/danews/#datasheet","title":"Datasheet","text":"

Following the recommendation and framework of [5] we add the following datasheet.

"},{"location":"datasheets/danews/#motivation","title":"Motivation","text":"

For what purpose was the dataset created? Who created the dataset? Who funded the creation of the dataset?

DANews was collected as a part of the HOPE project, examining news coverage during the COVID-19 pandemic. The purpose was to train a model to understand how the novelty and resonance imprint of COVID-19 as a case of crisis compared to non-crises news imprints.

Any other comments?

No.

"},{"location":"datasheets/danews/#composition","title":"Composition","text":"

What do the instances that comprise the dataset represent (e.g., documents, photos, people, countries)?

Instances of the dataset are Danish articles derived from Danish tabloids or news media.

Does the dataset contain all possible instances or is it a sample (not necessarily random) of instances from a larger set?

Prior to filtering DaNews dataset contains all digitized news articles from the given period across the sources.

What data does each instance consist of? \u201cRaw\u201d data (e.g., unprocessed text or images) or features? In either case, please provide a description.

Each instance consists of the following columns

'ArticleUrl', 'Heading', 'SubHeading', 'Lead', 'Paragraph', 'PublishDate', 'BodyText', \n'Captions', 'Authors', 'Source', 'WordCount', 'ArticleId', 'PageIds', 'Section', 'text'\n

Where we constructed the columns text column by joining the Heading, SubHeading using newline. If the text field is empty it is ignored and no newline is added. The we join the resulting string with the BodyText using two newlines.

During the quality filtering, we add the following indicator columns:

'passed_quality_filter', 'filtered_by_max_chr_length', 'filtered_by_doc_length', \n'filtered_by_mean_word_length', 'filtered_by_alpha_ratio', 'filtered_by_stop_word', \n'filtered_by_symbol_2_word_hashtag', 'filtered_by_symbol_2_word_ellipsis',\n'filtered_by_line_bullets_or_ellipsis', 'filtered_by_duplicate_lines_chr_fraction',\n'filtered_by_duplicate_paragraph_chr_fraction', 'filtered_by_top_ngram_chr_fraction',\n'filtered_by_duplicate_ngram_chr_fraction', 'is_duplicate'\n

Is there a label or target associated with each instance? If so, please provide a description.

No.

Is any information missing from individual instances? If so, please provide a description, explaining why this information is missing (e.g., because it was unavailable). This does not include intentionally removed information but might include, e.g., redacted text.

The team of researchers at the Humanities Computing Aarhus (CHCAA) have not removed any information from the instances.

Are relationships between individual instances made explicit (e.g., users\u2019 movie ratings, and social network links)? If so, please describe how these relationships are made explicit.

The metadata columns denote the relationship between articles including the date of publication, sections, and authors.

Are there recommended data splits (e.g., training, development/validation, testing)? If so, please provide a description of these splits, explaining the rationale behind them.

There are not splits performed on this dataset.

Are there any errors, sources of noise, or redundancies in the dataset? If so, please provide a description.

News sources can publish their content both in an online and printed format which would lead to similar instances in the dataset. To alleviate this redundancy by removing near-duplicates (see Preprocessing/cleaning/labeling).

Is the dataset self-contained, or does it link to or otherwise rely on external resources (e.g., websites, tweets, other datasets)?

Articles are intended to tell a self-contained story but can include external references such as tweets or website URLs.

Does the dataset contain data that, if viewed directly, might be offensive, insulting, threatening, or might otherwise cause anxiety?

Articles often describe content that is considered offensive, insulting, or threatening.

"},{"location":"datasheets/danews/#collection-process","title":"Collection Process","text":"

If the dataset is a sample from a larger set, what was the sampling strategy?

The dataset is not a sample, but is a filtered version of the full dataset, see Preprocessing/cleaning/labeling for more on this.

Over what timeframe was the data collected?

The dataset includes articles from 1 December 2019 to 30 April 2021.

Were any ethical review processes conducted?

No.

"},{"location":"datasheets/danews/#preprocessingcleaninglabeling","title":"Preprocessing/cleaning/labeling","text":"

Was any preprocessing/Cleaning/Labeling of the data done (e.g., discretization or bucketing, tokenization, part-of-speech tagging, SIFT feature extraction, removal of instances, processing of missing values)?

DaNews has been filtered using a series of heuristic filters as well as removing repetitious texts. Following the filtering, DaNews is deduplicated to remove exact and near-duplicates.

Of all documents, 9% were filtered due to low quality and 4% because they were near-duplicates.

For quality filtering, DaNews applies a filter akin to [2] which contains text that:

  • Contain at least 2 Danish stopwords. For the stopword list we use the one used in SpaCy v.3.1.4.
  • Have a mean word length between 3 and 10.
  • Have a token length between 50 and 100,000.
  • Have less than 5,000,000 characters.
  • Have less than 60% of words containing an alphabetic character.
  • Have a symbol-to-word ratio lower than 10% for hashtags and ellipsis.
  • Have less than 90% of lines starting with a bullet point.
  • have less than 30% of lines ending with an ellipsis.

  • Have a low high degree of repetitious text:

  • Have less than 20% of characters contained within duplicate lines.
  • Have less than 20% of characters contained within duplicate paragraphs.
  • Where the top 2-4 grams constitute less than 20%, 18%, 16%, respectively, of the text.
  • Where the duplicate 5-10 grams constitute less than 25%, 24%, 23%, 22%, 21%, 20% of the text, respectively.

The deduplication removed all documents with a 13-gram Jaccard similarity higher than 80% following the MinHash algorithm [1] using 128 permutations. The MinHash algorithm is a probabilistic data structure for approximating the Jaccard similarity between two sets.

Is the software used to preprocess/clean/label the instances available?

Yes, the scripts are available here. The scripts use version 0.0.2 of the dfm package.

"},{"location":"datasheets/danews/#uses","title":"Uses","text":"

Has the dataset been used for any tasks already?

Yes, the dataset has been used to pre-train Danish language models. Parts of the dataset have also been used in [3] and [4]

Is there a repository that links to any or all papers or systems that use the dataset?

No.

What (other) tasks could the dataset be used for?

The scale of the dataset makes it suitable for NLP tasks such as language modeling. Similarly, the structure of the articles makes it a suitable dataset for training text summarisation models.

Is there anything about the composition of the dataset or the way it was collected and preprocessed/cleaned/labeled that might impact future uses?

This dataset is static and thus does not evolve over time with the language. A consequence of this is that it will become increasingly outdated over time.

Are there tasks for which the dataset should not be used?

This dataset contains Danish articles and thus should not be used for non-Danish language tasks.

As the writers of the content are predominantly journalists, it reflects a certain writing style which is unlikely to reflect the Danish language as a whole.

"},{"location":"datasheets/danews/#distribution","title":"Distribution","text":"

Will the dataset be distributed to third parties outside of the entity (e.g., company, institution, organization) on behalf of which the dataset was created?

Data will only be available at the entity during the project. If you wish to access the dataset, you will have to come to an agreement with the individual Danish newspapers.

"},{"location":"datasheets/danews/#citation","title":"Citation","text":"

If you wish to cite this work please see our GitHub page for an up-to-date citation: https://github.com/centre-for-humanities-computing/danish-foundation-models

"},{"location":"datasheets/danews/#references","title":"References:","text":"
  • [1] Broder, Andrei Z. \"On the resemblance and containment of documents.\" Proceedings. Compression and Complexity of SEQUENCES 1997 (Cat. No. 97TB100171). IEEE, 1997.
  • [2] Rae, J. W., Borgeaud, S., Cai, T., Millican, K., Hoffmann, J., Song, F., Aslanides, J., Henderson, S., Ring, R., Young, S., Rutherford, E., Hennigan, T., Menick, J., Cassirer, A., Powell, R., Driessche, G. van den, Hendricks, L. A., Rauh, M., Huang, P.-S., \u2026 Irving, G. (2021). Scaling Language Models: Methods, Analysis & Insights from Training Gopher. https://arxiv.org/abs/2112.11446v2
  • [3] Baglini, R. B., Nielbo, K. L., H\u00e6strup, F., Enevoldsen, K., Vahlstrup, P. B., & Roepstorff, A. (2021, June 2). When no news is bad news: Detection of negative events from news media content. https://2021.dhbenelux.org/
  • [4] Nielbo, K. L., Baglini, R. B., Vahlstrup, P. B., Enevoldsen, K. C., Bechmann, A., & Roepstorff, A. (2021, January). News information decoupling: An information signature of catastrophes in legacy news media. https://eadh2020-2021.org/
  • [5] T. Gebru, J. Morgenstern, B. Vecchione, J. W. Vaughan, H. Wallach, H. Daum\u00e9 III, and K. Crawford. Datasheets for datasets. arXiv preprint arXiv:1803.09010, 2018.
"},{"location":"datasheets/daradio/","title":"DaRadio Datasheet","text":"

Version: 1.0.0

Homepage: https://github.com/centre-for-humanities-computing/danish-foundation-models

License: Not publicly available.

DaRadio consists of radio broadcasts from the Danish radio stations DR P1 and Radio24Syv, and contains approximately 140.000 hours of speech. DaRadio includes all shows aired on DR P1 from 2005 to 2021, and all shows aired on Radio24Syv from 2011 to 2019.

DaRadio has been deduplicated using a series of heuristics based on metadata. For more on deduplication, see the data cleaning section further below.

"},{"location":"datasheets/daradio/#datasheet","title":"Datasheet","text":"

Following the recommendation and framework of [1], we add the following datasheet.

"},{"location":"datasheets/daradio/#motivation","title":"Motivation:","text":"

For what purpose was the dataset created? Who created the dataset? Who funded the creation of the dataset?

Data included in DaRadio was collected following the Danish Legal Deposit Act by the Royal Danish Library (RDL). From this, a dataset of Danish speech-only radio was derived by RDL. The dataset was created for research purposes, including training a Danish wav2vec2.0 model.

The dataset was preprocessed to remove duplicates by a team of researchers at the Center for Humanities Computing, Aarhus University (CHC) with collaborators from the Danish speech-processing company Alvenir.

"},{"location":"datasheets/daradio/#composition","title":"Composition","text":"

What do the instances that comprise the dataset represent (e.g., documents, photos, people, countries)?

Instances of the dataset include an mp3 file for each show aired on the two stations within the period. Further metadata include information on date and time of airing, title, short description of the show, and various internal identifiers used by RDL.

How many instances are there in total (of each type, if appropriate)?

DaRadio consists of a total of 215.582 hours of unprocessed Danish speech radio shows across two stations, DR P1 and Radio24syv. The table below shows the distribution over the stations with and without heuristic rerun removal.

Source Duration (hours) Reruns removed P1 145.160 False 97.401 True Radio24syv 70.422 False 44.569 True

Does the dataset contain all possible instances or is it a sample (not necessarily random) of instances from a larger set?

The dataset contains all shows from the two stations in the time period (2005-2021 for DR P1 and 2011-2019 for Radio24syv).

If the dataset is a sample from a larger set, what was the sampling strategy?

The dataset is a subset of all Danish radio. The two stations were chosen for the dataset as they are talk-radio only.

Who was involved in the data collection process?

The RDL collects Danish radio shows and constructed DaRadio for handing to researchers at CHC.

Over what timeframe was the data collected?

The dataset includes radio shows from the period 2005 to 2021.

Were any ethical review processes conducted?

The RDL collects radio shows in adherence to Danish Archival laws. DaRadio was constructed for a research project, for which a project proposal was accepted by RDL. No other ethical review processes were conducted.

"},{"location":"datasheets/daradio/#preprocessingcleaninglabeling","title":"Preprocessing/cleaning/labeling","text":"

Was any preprocessing/Cleaning/Labeling of the data done (e.g., discretization or bucketing, tokenization, part-of-speech tagging, SIFT feature extraction, removal of instances, processing of missing values)?

DaRadio has been deduplicated using a series of heuristic filters and all files have been converted to 16 Khz .wav files.

Reruns/duplicates were identified by the following rules:

  • If the phrase \"sendt f\u00f8rste gang\" [\"aired the first time\"] or \"genudsendelse\" [\"rerun\"] appeared in the show description.
  • If the title contained \"(G)\" (short for \"genudsendelse\")
  • If the show was broadcast between 23:00 and 5:00.

The deduplication was coded and conducted by researchers at CHC.

Is the software used to preprocess/clean/label the instances available?

The scripts are available at the following GitHub repository: link.

"},{"location":"datasheets/daradio/#uses","title":"Uses","text":"

Has the dataset been used for any tasks already?

Yes, the dataset has been used to pre-train a Danish wav2vec2.0 model.

Is there a repository that links to any or all papers or systems that use the dataset?

No, but as of 23/10/16 no others have used the dataset.

What (other) tasks could the dataset be used for?

As the dataset only contains un-labelled data, i.e. no transcriptions, it is mainly designed for pre-training language models. However, given the metadata and recurring hosts, further processing might make it possible to train e.g. text-to-speech systems.

Is there anything about the composition of the dataset or the way it was collected and preprocessed/cleaned/labeled that might impact future uses?

This dataset is static and does not evolve over time with the language, thus will become increasingly outdated over time.

"},{"location":"datasheets/daradio/#distribution","title":"Distribution","text":"

Will the dataset be distributed to third parties outside of the entity (e.g., company, institution, organization) on behalf of which the dataset was created?

Data will only be available at the entity during the project. An equivalent or updated dataset can be requested at the Royal Danish Library.

"},{"location":"datasheets/daradio/#citation","title":"Citation","text":"

If you wish to cite this work please see our GitHub page for an up to date citation: https://github.com/centre-for-humanities-computing/danish-foundation-models

"},{"location":"datasheets/daradio/#references","title":"References:","text":"
  • [1] T. Gebru, J. Morgenstern, B. Vecchione, J. W. Vaughan, H. Wallach, H. Daum\u00e9 III, and K. Crawford. Datasheets for datasets. arXiv preprint arXiv:1803.09010, 2018.
"},{"location":"datasheets/hopetwitter/","title":"HopeTwitter","text":"

Version: 1.0.0

Homepage: https://github.com/centre-for-humanities-computing/danish-foundation-models

license: Not publicly available.

HopeTwitter consists of tweets collected from the Twitter API using a stopword list and consists of 32.5 million tweets across 538,398 unique users. HopeTwitter includes tweets from 2019-01-01 to 2021-04-30.

HopeTwitter has been filtered to only include Danish tweets, based on the language tag from the Twitter API. Similarly, HopeTwitter has had low-quality tweets removed and has then been deduplicated to remove exact and near-duplicates. For more on data cleaning, see the section \"Preprocessing/cleaning/labeling\".

HopeTwitter includes a total of 0.97 billion tokens before filtering and includes 0.48 billion (50%) after.

"},{"location":"datasheets/hopetwitter/#datasheet","title":"Datasheet","text":"

Following the recommendation and framework of [3] we add the following datasheet.

"},{"location":"datasheets/hopetwitter/#motivation","title":"Motivation","text":"

For what purpose was the dataset created? Who created the dataset? Who funded the creation of the dataset?

HopeTwitter was initially collected as a part of the HOPE project, examining societal behaviour during the COVID-19 pandemic. Next, HopeTwitter was cleaned in preparation for pre-training Danish language models by a team of researchers at Center for Humanities Computing Aarhus (CHCAA), using a codebase jointly developed with partners from academia and industry, including KMD, Ekstra Bladet, Bristol University and Deepdivr. For more on collaborators on this project see the GitHub repository.

Any other comments?

No.

"},{"location":"datasheets/hopetwitter/#composition","title":"Composition","text":"

What do the instances that comprise the dataset represent (e.g., documents, photos, people, countries)?

HopeTwitter consists of tweets containing at least one of a series of stopwords, collected through the Twitter API. See \"If the dataset is a sample from a larger set, what was the sampling strategy?\" for the stopword list.

How many instances are there in total (of each type, if appropriate)?

The dataset consists of 32,499,019 documents, of which 14,399,284 (44%) were considered duplicates.

Does the dataset contain all possible instances or is it a sample (not necessarily random) of instances from a larger set?

No. It does not contain all instances of Danish Twitter as there are likely some Danish tweets which do not include a stopword.

Is there a label or target associated with each instance? If so, please provide a description.

No.

Are there recommended data splits (e.g., training, development/validation, testing)? If so, please provide a description of these splits, explaining the rationale behind them.

No splits are performed on this dataset.

If the dataset is a sample from a larger set, what was the sampling strategy?

Tweets are streamed continuously by querying a set of the highest-frequency Scandinavian-specific keywords from Danish, Norwegian (Bokm\u00e5l) and Swedish, resulting in the following list:

aften, aldrig, alltid, altid, andet, arbejde, bedste, beh\u00f6ver, beh\u00f8ver, beklager,\nber\u00e4tta, betyr, blev, blevet, blir, blitt, blive, bliver, bruge, burde, b\u00e4ttre, b\u00e5e\nb\u00f8r, deim, deires, ditt, drar, drepe, dykk, dykkar, d\u00e4r, d\u00f6d, d\u00f6da, d\u00f8d, d\u00f8de, efter,\nelsker, endnu, faen, fandt, feil, fikk, finner, flere, forst\u00e5r, fortelle, fortfarande,\nfortsatt, fort\u00e6lle, fr\u00e5n, f\u00e5, f\u00e5et, f\u00e5r, f\u00e5tt, f\u00f6rl\u00e5t, f\u00f6rsta, f\u00f6rs\u00f6ker, f\u00f8r, f\u00f8rst,\nf\u00f8rste, gick, gikk, gillar, gjennom, gjerne, gjorde, gjort, gj\u00f8r, gj\u00f8re, godt, g\u00e5, g\u00e5ng,\ng\u00e5r, g\u00f6ra, g\u00f8r, g\u00f8re, hadde, hall\u00e5, havde, hedder, helt, helvete, hende, hendes, hennes,\nherregud, hjelp, hjelpe, hjem, hj\u00e4lp, hj\u00e5, hj\u00e6lp, hj\u00e6lpe, honom, hossen, hvem, hvis,\nhvordan, hvorfor, h\u00e4nder, h\u00e4r, h\u00e5ll, h\u00e5ller, h\u00f8r, h\u00f8re, h\u00f8rer, igjen, ikkje, ingenting,\ninkje, inte, intet, jeres, j\u00e4vla, kanske, kanskje, kender, kjenner, korleis, kvarhelst,\nkveld, kven, kvifor, k\u00e4nner, ledsen, lenger, lidt, livet, l\u00e4ngre, l\u00e5t, l\u00e5ter, l\u00e6nge,\nmeget, menar, mycket, mykje, m\u00e5, m\u00e5de, m\u00e5nga, m\u00e5r, m\u00e5ske, m\u00e5ste, m\u00e5tte, navn, nogen,\nnoget, nogle, noko, nokon, nokor, nokre, n\u00e5gon, n\u00e5got, n\u00e5gra, n\u00e5n, n\u00e5r, n\u00e5t, n\u00f8dt,\nocks\u00e5, ogs\u00e5, pengar, penger, pratar, pr\u00f8ver, p\u00e5, redan, rundt, r\u00e4tt, sagde, saker,\nsamma, sammen, selv, selvf\u00f8lgelig, sidan, sidste, siger, sikker, sikkert, sj\u00e4lv, skete,\nskjedde, skjer, skulle, sluta, slutt, snakke, snakker, snill, sn\u00e4lla, somt, stadig,\nstanna, sted, st\u00e5r, synes, s\u00e4ger, s\u00e4tt, s\u00e5, s\u00e5dan, s\u00e5g, s\u00e5nn, tager, tiden, tilbage,\ntilbake, tillbaka, titta, trenger, trodde, troede, tror, tv\u00e5, tycker, t\u00e4nker, uden,\nundskyld, unnskyld, 
urs\u00e4kta, uten, varf\u00f6r, varit, varte, veldig, venner, verkligen,\nvidste, vilken, virkelig, visste, v\u00e4g, v\u00e4l, v\u00e4ldigt, v\u00e4n, v\u00e5r, v\u00e5ra, v\u00e5re, v\u00e6k, v\u00e6r, \nv\u00e6re, v\u00e6ret, \u00e4lskar, \u00e5h, \u00e5r, \u00e5t, \u00f6ver\n

Who was involved in the data collection process?

A team of researchers at the Center for Humanities Computing Aarhus (CHCAA), including Kristoffer Nielbo and Peter Bjerregaard Vahlstrup, in collaboration with Rebekah Baglini, at the School of Communication and Culture at Aarhus University.

Over what timeframe was the data collected?

The dataset includes tweets from the period 2019-01-01 to 2021-04-30.

Were any ethical review processes conducted?

No

"},{"location":"datasheets/hopetwitter/#preprocessingcleaninglabeling","title":"Preprocessing/cleaning/labeling","text":"

Was any preprocessing/Cleaning/Labeling of the data done (e.g., discretization or bucketing, tokenization, part-of-speech tagging, SIFT feature extraction, removal of instances, processing of missing values)?

Firstly, HopeTwitter had non-Danish tweets removed, after which a series of heuristic filters were applied, including the removal of repetitious texts. Following the filtering, HopeTwitter was deduplicated, removing both exact duplicates and near-duplicates.

Of all documents, 3,023,427 (9%) were filtered due to low-quality and 14,399,284 (33%) because they were near-duplicates.

For the quality filtering, HopeTwitter applies a filter akin to [2] which contains text that:

  • Contain at least 2 Danish stopwords. For the stopword list we use the one used in SpaCy v.3.1.4.
  • Have a mean word length between 2 and 14.
  • Have a token length between 10 and 100,000.
  • Have less than 5,000,000 characters.
  • Have less than 60% of words containing an alphabetic character.

  • Have a low degree of repetitious text:

  • Have less than 20% of characters contained within duplicate lines.
  • Have less than 20% of characters contained within duplicate paragraphs.
  • Where the top 2-4 grams constitute less than 20%, 18%, 16%, respectively, of the text.
  • Where the duplicate 5-10 grams constitute less than 25%, 24%, 23%, 22%, 21%, 20% of the text, respectively.

The deduplication removed all documents with a 10-gram Jaccard similarity higher than 80% following the MinHash algorithm [1] using 128 permutations. The MinHash algorithm is a probabilistic data structure for approximating the Jaccard similarity between two sets.

Is the software used to preprocess/clean/label the instances available?

Yes, the scripts are available here. The scripts use version 0.0.2 of the dfm package.

"},{"location":"datasheets/hopetwitter/#uses","title":"Uses","text":"

Has the dataset been used for any tasks already?

Yes, the dataset has been used to pre-train Danish language models. Parts of the dataset have also been used in HOPE project reports and in [4].

Is there a repository that links to any or all papers or systems that use the dataset?

There is a website for the HOPE project for which the dataset was initially collected. This website contains reports and articles regarding the dataset.

What (other) tasks could the dataset be used for?

The scale of the dataset makes it suitable for NLP tasks such as language modelling. Similarly, the conversation structure could be used to train conversational chatbots.

Is there anything about the composition of the dataset or the way it was collected and preprocessed/cleaned/labeled that might impact future uses?

This dataset is static and thus does not evolve over time with the language. A consequence of this is that it will become increasingly outdated over time. However, it is possible to extend the dataset by a continual collection of tweets.

Are there tasks for which the dataset should not be used?

HopeTwitter contains Danish tweets and thus should not be used for non-Danish language tasks.

As the writers of the content are predominantly journalists, politicians, influencers, and academics, the dataset reflects a certain social group which is unlikely to reflect the Danish population as a whole.

"},{"location":"datasheets/hopetwitter/#distribution","title":"Distribution","text":"

Will the dataset be distributed to third parties outside of the entity (e.g., company, institution, organization) on behalf of which the dataset was created?

Data will only be available at the entity during the project. After the project the data will be archived for a period of five years to comply with the university policy for research integrity. After the five years, the data will be registered at the national archives as required by executive order 514 for potential long-term deposit.

"},{"location":"datasheets/hopetwitter/#citation","title":"Citation","text":"

If you wish to cite this work please see our GitHub page for an up to date citation: https://github.com/centre-for-humanities-computing/danish-foundation-models

"},{"location":"datasheets/hopetwitter/#references","title":"References:","text":"
  • [1] Broder, Andrei Z. \"On the resemblance and containment of documents.\" Proceedings. Compression and Complexity of SEQUENCES 1997 (Cat. No. 97TB100171). IEEE, 1997.
  • [2] Rae, J. W., Borgeaud, S., Cai, T., Millican, K., Hoffmann, J., Song, F., Aslanides, J., Henderson, S., Ring, R., Young, S., Rutherford, E., Hennigan, T., Menick, J., Cassirer, A., Powell, R., Driessche, G. van den, Hendricks, L. A., Rauh, M., Huang, P.-S., \u2026 Irving, G. (2021). Scaling Language Models: Methods, Analysis & Insights from Training Gopher. https://arxiv.org/abs/2112.11446v2
  • [3] T. Gebru, J. Morgenstern, B. Vecchione, J. W. Vaughan, H. Wallach, H. Daum\u00e9 III, and K. Crawford. Datasheets for datasets. arXiv preprint arXiv:1803.09010, 2018.
  • [4]\u00a0Johansen, N., Marjanovic, S. V., Kjaer, C. V., Baglini, R. B., & Adler-Nissen, R. (2022). Ridiculing the \u201ctinfoil hats:\u201d Citizen responses to COVID-19 misinformation in the Danish facemask debate on Twitter. Harvard Kennedy School Misinformation Review. https://doi.org/10.37016/mr-2020-93
"},{"location":"datasheets/netarkivet_text/","title":"NAT: Netarkivet Text","text":"

Version: 1.0.0

Homepage: https://github.com/centre-for-humanities-computing/danish-foundation-models

license: Not publicly available.

Netarkivet Text (NAT) consists of a subsection of Netarkivet and contains 2,332 million sites across 1.6 million domains. Netarkivet includes sites from the period 2006 to 2016.

NAT has been filtered using a series of heuristic filters and by removing repetitious texts. Following the filtering, NAT is further deduplicated to remove exact and near-duplicates. For more on data cleaning, see the preprocessing/cleaning/labeling section below.

The sites which passed the quality filter were deduplicated per year. NAT consists of 865 billion tokens, of which 134 billion (15%) were left after filtering and deduplication.

"},{"location":"datasheets/netarkivet_text/#datasheet","title":"Datasheet","text":"

Following the recommendation and framework of [3], we add the following datasheet.

"},{"location":"datasheets/netarkivet_text/#motivation","title":"Motivation:","text":"

For what purpose was the dataset created? Who created the dataset? Who funded the creation of the dataset?

Netarkivet was created following the Danish Legal Deposit Act, from which a text-only corpus was derived for research purposes, see [4,5]. This is the part from which this dataset is derived. This part has then been filtered with the intention of training Danish language models by a team of researchers at the Center for Humanities Computing Aarhus (CHCAA) using a codebase jointly developed with partners from industry (e.g. KMD, Ekstra Bladet) and other research institutions (e.g. Bristol University, Alexandra Institute). For more on collaborators on this project see the GitHub repository.

Any other comments?

No.

"},{"location":"datasheets/netarkivet_text/#composition","title":"Composition","text":"

What do the instances that comprise the dataset represent (e.g., documents, photos, people, countries)?

Instances of the dataset are Danish domain sites, which further include metadata such as:

Column Dtype 0 harvest_id int32 1 job_id int32 2 sha1 object 3 mime_served object 4 language object 5 mime_droid object 6 timestamp object 7 uri object 9 domain_key object

Where harvest_id is the id of the associated Netarkivet web harvest. Each web harvest consists of jobs, each with their associated job-id.

Language is the language classified using the following language detection library. uri is the URI of the site e.g. \"http://www.apple.com/podcasting\". timestamp is the date given in the format \"20060612105533\", indicating year, month, date, and time. The sha1 is the website hash. mime_* indicates the mime/media type. mime_served could for instance be \"text/html; charset=iso-8859-1\" and mime_droid could be \"text/html; version=2.0\" and is the mime type identified by the server and by DROID, respectively.

How many instances are there in total (of each type, if appropriate)?

NAT contains a total of 2,332 million sites distributed over 1.6 million domains. 1,370 million of these sites are Danish, with the largest secondary language being English with 718 million sites.

Does the dataset contain all possible instances or is it a sample (not necessarily random) of instances from a larger set?

These domains are a subset of Netarkivet, which again is a sample of all the Danish content on the internet.

If the dataset is a sample from a larger set, what was the sampling strategy?

Netarkivet has been scraped from the internet using the following procedures:

  • Cross-sectional collection of all Danish websites up to four times a year.
  • Selective collection of the following domains: All Danish newsmedia (with a frequency ranging from 12 times a day to weekly), political parties, organizations and unions, legal bodies such as ministries and agencies, selected social media profiles, and YouTube videos.
  • Event collections of 2-3 events yearly (e.g. elections and the COVID pandemic)
  • Miscellaneous/on-demand web scrapes (for instance in collaboration with researchers)

A selective subset of Netarkivet is then extracted per year from 2006 to 2016 such that it contains no duplicate sites. Apache Tika (v. 1.15) is then used to extract the text from the sites. During extraction, all HTML markup is removed, along with javascript and CSS code. The content of textual HTML elements, such as <P> and <H1>, is concatenated into one piece of text.

Who was involved in the data collection process?

The Royal Danish Library collects Netarkivet, and Br\u00fcgger et al. [4,5] helped with the construction of NAT.

Over what timeframe was the data collected?

The dataset includes articles from the period 2006 to 2016.

Were any ethical review processes conducted?

Netarkivet is collected in adherence to an update to the Danish archival law in 2005, which extended the law to also include internet domains.

Our text subset was constructed for a research project and thus a project proposal has been accepted by the Royal Danish Library. Besides these, the author is not aware of any ethical approvals.

"},{"location":"datasheets/netarkivet_text/#preprocessingcleaninglabeling","title":"Preprocessing/cleaning/labeling","text":"

Was any preprocessing/Cleaning/Labeling of the data done (e.g., discretization or bucketing, tokenization, part-of-speech tagging, SIFT feature extraction, removal of instances, processing of missing values)?

NAT has been filtered using a series of heuristic filters as well as removing repetitious texts. Following the filtering, the corpus was deduplicated to remove exact and near-duplicates.

For quality filtering, NAT applies a filter akin to [2] which contains text that:

  • Contain at least 2 Danish stopwords. For the stopword list, we use the one used in SpaCy v.3.1.4.
  • Have a mean word length between 3 and 10.
  • Have a token length between 50 and 100,000.
  • Have less than 5,000,000 characters.
  • Have less than 60% of words containing an alphabetic character.
  • Have a symbol-to-word ratio lower than 10% for hashtags and ellipsis.
  • Have less than 90% of lines starting with a bullet point.
  • Have less than 30% of lines ending with an ellipsis.

  • Have a low degree of repetitious text:

  • Have less than 20% of characters contained within duplicate lines.
  • Have less than 20% of characters contained within duplicate paragraphs.
  • Where the top 2-4 grams constitute less than 20%, 18%, 16%, respectively, of the text.
  • Where the duplicate 5-10 grams constitute less than 25%, 24%, 23%, 22%, 21%, 20% of the text, respectively.

The deduplication removed all documents with a 13-gram Jaccard similarity higher than 80% following the MinHash algorithm [1] using 128 permutations. The MinHash algorithm is a probabilistic data structure for approximating the Jaccard similarity between two sets.

Is the software used to preprocess/clean/label the instances available?

Yes, the scripts are available here. The scripts use version 0.0.2 of the dfm package.

"},{"location":"datasheets/netarkivet_text/#uses","title":"Uses","text":"

Has the dataset been used for any tasks already?

Yes, the dataset has been used to pre-train Danish language models. Furthermore, the unfiltered dataset has also been used in [4] and [5], for examining the development of the Danish web.

Is there a repository that links to any or all papers or systems that use the dataset?

No.

What (other) tasks could the dataset be used for?

The scale of the dataset makes it suitable for NLP tasks such as language modelling. It is likely possible to extract reviews, social media posts and similar semi-labelled datasets from the dataset which can be used for NLP task such as sentiment analysis or hate-speech detection.

The content of the dataset makes it usable in a wide range of other applications in media studies, social science or the humanities, including the development of written Danish, emerging conspiracy theories, and online information dynamics.

Is there anything about the composition of the dataset or the way it was collected and preprocessed/cleaned/labeled that might impact future uses?

This dataset is static and does not evolve over time with the language, and will thus become increasingly outdated over time. Netarkivet, from which it is derived, is not static, however, and is thus likely to develop further, which will allow us to update the dataset going forward.

"},{"location":"datasheets/netarkivet_text/#distribution","title":"Distribution","text":"

Will the dataset be distributed to third parties outside of the entity (e.g., company, institution, organization) on behalf of which the dataset was created?

Data will only be available at the entity during the project. An equivalent or updated dataset can be requested at the Royal Danish Library.

"},{"location":"datasheets/netarkivet_text/#citation","title":"Citation","text":"

If you wish to cite this work please see our GitHub page for an up to date citation: https://github.com/centre-for-humanities-computing/danish-foundation-models

"},{"location":"datasheets/netarkivet_text/#references","title":"References:","text":"
  • [1] Broder, Andrei Z. \"On the resemblance and containment of documents.\" Proceedings. Compression and Complexity of SEQUENCES 1997 (Cat. No. 97TB100171). IEEE, 1997.
  • [2] Rae, J. W., Borgeaud, S., Cai, T., Millican, K., Hoffmann, J., Song, F., Aslanides, J., Henderson, S., Ring, R., Young, S., Rutherford, E., Hennigan, T., Menick, J., Cassirer, A., Powell, R., Driessche, G. van den, Hendricks, L. A., Rauh, M., Huang, P.-S., \u2026 Irving, G. (2021). Scaling Language Models: Methods, Analysis & Insights from Training Gopher. https://arxiv.org/abs/2112.11446v2
  • [3] T. Gebru, J. Morgenstern, B. Vecchione, J. W. Vaughan, H. Wallach, H. Daum\u00e9 III, and K. Crawford. Datasheets for datasets. arXiv preprint arXiv:1803.09010, 2018.
  • [4] Br\u00fcgger, N., Nielsen, J., & Laursen, D. (2020). Big data experiments with the archived Web: Methodological reflections on studying the development of a nation\u2019s Web. First Monday. https://doi.org/10.5210/fm.v25i3.10384
  • [5] Br\u00fcgger, N. (2021). Digital humanities and web archives: Possible new paths for combining datasets. International Journal of Digital Humanities, 2(1), 145\u2013168. https://doi.org/10.1007/s42803-021-00038-z
"}]} \ No newline at end of file diff --git a/sitemap.xml b/sitemap.xml new file mode 100644 index 00000000..0f8724ef --- /dev/null +++ b/sitemap.xml @@ -0,0 +1,3 @@ + + + \ No newline at end of file diff --git a/sitemap.xml.gz b/sitemap.xml.gz new file mode 100644 index 00000000..3156aac7 Binary files /dev/null and b/sitemap.xml.gz differ