diff --git a/.nojekyll b/.nojekyll
new file mode 100644
index 00000000..e69de29b
diff --git a/404.html b/404.html
new file mode 100644
index 00000000..b0a4b4f3
--- /dev/null
+++ b/404.html
@@ -0,0 +1,518 @@
+[MkDocs Material page scaffolding omitted; page body: "404 - Not found"]
diff --git a/_static/collab.png b/_static/collab.png
Binary files /dev/null and b/_static/collab.png differ
diff --git a/_static/dev_container.png b/_static/dev_container.png
Binary files /dev/null and b/_static/dev_container.png differ
diff --git a/_static/icon.png b/_static/icon.png
Binary files /dev/null and b/_static/icon.png differ
diff --git a/_static/logo.png b/_static/logo.png
Binary files /dev/null and b/_static/logo.png differ
diff --git a/_static/structure.png b/_static/structure.png
Binary files /dev/null and b/_static/structure.png differ
diff --git a/assets/images/favicon.png b/assets/images/favicon.png
Binary files /dev/null and b/assets/images/favicon.png differ
diff --git a/assets/javascripts/bundle.220ee61c.min.js b/assets/javascripts/bundle.220ee61c.min.js
new file mode 100644
index 00000000..116072a1
+[minified Material for MkDocs JavaScript bundle omitted]
[DaNews datasheet page begins; MkDocs Material page scaffolding omitted]

DaNews

+

Version: 1.0.0

+

Homepage: https://github.com/centre-for-humanities-computing/danish-foundation-models

+

License: Not publicly available.

+
+

DaNews consists of articles from Danish news and tabloid media from 1 December 2019 to 30 April 2021. The articles stem from multiple news sources, including both online and physical newspapers.

+

DaNews consists of 403 million tokens, of which 93% were left after quality filtering and deduplication.

+

Datasheet

+

Following the recommendation and framework of [5], we add the following datasheet.

+

Motivation

+

For what purpose was the dataset created? Who created the dataset? Who funded the creation of the dataset?

+

DaNews was collected as a part of the HOPE project, examining news coverage during the COVID-19 pandemic. The purpose was to train a model to examine how the novelty and resonance imprint of COVID-19, as a case of crisis, compared to the imprint of non-crisis news.

+

Any other comments?

+

No.

+

Composition

+

How many instances are there in total (of each type, if appropriate)?

+

The unfiltered dataset consists of 713,429 documents including a total of 403,089,625 tokens.

+

What do the instances that comprise the dataset represent (e.g., documents, photos, people, countries)?

+

Instances of the dataset are Danish articles derived from Danish tabloids or news media.

+

Does the dataset contain all possible instances or is it a sample (not necessarily random) of instances from a larger set?

+

Prior to filtering, the DaNews dataset contains all digitized news articles from the given period across the sources.

+

What data does each instance consist of? “Raw” data (e.g., unprocessed text or images) or features? In either case, please provide a description.

+

Each instance consists of the following columns:

'ArticleUrl', 'Heading', 'SubHeading', 'Lead', 'Paragraph', 'PublishDate', 'BodyText',
'Captions', 'Authors', 'Source', 'WordCount', 'ArticleId', 'PageIds', 'Section', 'text'
+

The text column is constructed by joining the Heading and SubHeading with a newline; if a field is empty, it is ignored and no newline is added. The resulting string is then joined with the BodyText using two newlines.
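A minimal sketch of this construction (the function name and signature are ours for illustration; field names follow the column list above):

```python
def build_text(heading: str, sub_heading: str, body_text: str) -> str:
    """Join Heading and SubHeading with a newline, skipping empty fields,
    then join the result with BodyText using two newlines."""
    head = "\n".join(part for part in (heading, sub_heading) if part)
    return "\n\n".join(part for part in (head, body_text) if part)
```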

+

During the quality filtering, we add the following indicator columns:

'passed_quality_filter', 'filtered_by_max_chr_length', 'filtered_by_doc_length',
'filtered_by_mean_word_length', 'filtered_by_alpha_ratio', 'filtered_by_stop_word',
'filtered_by_symbol_2_word_hashtag', 'filtered_by_symbol_2_word_ellipsis',
'filtered_by_line_bullets_or_ellipsis', 'filtered_by_duplicate_lines_chr_fraction',
'filtered_by_duplicate_paragraph_chr_fraction', 'filtered_by_top_ngram_chr_fraction',
'filtered_by_duplicate_ngram_chr_fraction', 'is_duplicate'
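For example, keeping only documents that passed every filter could then be a simple boolean selection (a sketch assuming the columns above live in a pandas DataFrame):

```python
import pandas as pd

def keep_clean(df: pd.DataFrame) -> pd.DataFrame:
    """Retain documents that passed quality filtering and are not duplicates."""
    return df[df["passed_quality_filter"] & ~df["is_duplicate"]]
```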

+

Is there a label or target associated with each instance? If so, please provide a description.

+

No.

+

Is any information missing from individual instances? If so, please provide a description, explaining why this information is missing (e.g., because it was unavailable). This does not include intentionally removed information but might include, e.g., redacted text.

+

The team of researchers at the Center for Humanities Computing Aarhus (CHCAA) has not removed any information from the instances.

+

Are relationships between individual instances made explicit (e.g., users' movie ratings and social network links)? If so, please describe how these relationships are made explicit.

+

The metadata columns denote the relationship between articles including the date of publication, sections, and authors.

+

Are there recommended data splits (e.g., training, development/validation, testing)? If so, please provide a description of these splits, explaining the rationale behind them.

+

There are no splits performed on this dataset.

+

Are there any errors, sources of noise, or redundancies in the dataset? If so, please provide a description.

+

News sources can publish their content in both an online and a printed format, which can lead to similar instances in the dataset. We alleviate this redundancy by removing near-duplicates (see Preprocessing/cleaning/labeling).

+

Is the dataset self-contained, or does it link to or otherwise rely on external resources (e.g., websites, tweets, other datasets)?

+

Articles are intended to tell a self-contained story but can include external references such as tweets or website URLs.

+

Does the dataset contain data that, if viewed directly, might be offensive, insulting, threatening, or might otherwise cause anxiety?

+

Articles often describe content that is considered offensive, insulting, or threatening.

+

Collection Process

+

What mechanisms or procedures were used to collect the data (e.g., hardware apparatuses or sensors, manual human curation, software programs, software APIs)?

+

A team of researchers at the Center for Humanities Computing Aarhus (CHCAA) obtained this dataset using a third-party API as well as a manual transfer from one of the parties. The API was limited to a subset of articles agreed upon within the agreements.

+

If the dataset is a sample from a larger set, what was the sampling strategy?

+

The dataset is not a sample, but a filtered version of the full dataset; see Preprocessing/cleaning/labeling for more on this.

+

Who was involved in the data collection process?

A team of researchers at the Center for Humanities Computing Aarhus (CHCAA) obtained this dataset using a third-party API as well as a manual transfer from some of the parties, and would like to thank the dataset owners for access to their articles.

+

Over what timeframe was the data collected?

+

The dataset includes articles from 1 December 2019 to 30 April 2021.

+

Were any ethical review processes conducted?

+

No.

+

Preprocessing/cleaning/labeling

+

Was any preprocessing/cleaning/labeling of the data done (e.g., discretization or bucketing, tokenization, part-of-speech tagging, SIFT feature extraction, removal of instances, processing of missing values)?

+

DaNews has been filtered using a series of heuristic filters, as well as by removing repetitious texts. Following the filtering, DaNews was deduplicated to remove exact and near-duplicates.

+

Of all documents, 9% were filtered out due to low quality and 4% because they were near-duplicates.

+

For quality filtering, DaNews applies a filter akin to [2], which retains text that:

+
  • Contain at least 2 Danish stopwords. For the stopword list we use the one used in SpaCy v.3.1.4.
  • Have a mean word length between 3 and 10.
  • Have a token length between 50 and 100,000.
  • Have less than 5,000,000 characters.
  • Have less than 60% of words containing an alphabetic character.
  • Have a symbol-to-word ratio lower than 10% for hashtags and ellipsis.
  • Have less than 90% of lines starting with a bullet point.
  • Have less than 30% of lines ending with an ellipsis.
  • Have a low degree of repetitious text:
    • Have less than 20% of characters contained within duplicate lines.
    • Have less than 20% of characters contained within duplicate paragraphs.
    • Where the top 2-4 grams constitute less than 20%, 18%, and 16% of the text, respectively.
    • Where the duplicate 5-10 grams constitute less than 25%, 24%, 23%, 22%, 21%, and 20% of the text, respectively.
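To make the heuristics concrete, here is a minimal sketch of two of these checks (our illustration using the thresholds above, not the dfm package's actual implementation):

```python
from collections import Counter

def mean_word_length(text: str) -> float:
    """Average character length of whitespace-separated words."""
    words = text.split()
    return sum(len(w) for w in words) / max(len(words), 1)

def duplicate_line_chr_fraction(text: str) -> float:
    """Fraction of characters contained within lines that occur more than once."""
    lines = [line for line in text.splitlines() if line.strip()]
    counts = Counter(lines)
    dup_chars = sum(len(line) * n for line, n in counts.items() if n > 1)
    return dup_chars / max(sum(len(line) for line in lines), 1)

def passes_two_checks(text: str) -> bool:
    """Mean word length in [3, 10] and <20% of characters in duplicate lines."""
    return 3 <= mean_word_length(text) <= 10 and duplicate_line_chr_fraction(text) < 0.20
```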

The deduplication removed all documents with a 13-gram Jaccard similarity higher than 80%, following the MinHash algorithm [1] using 128 permutations. MinHash is a probabilistic technique for approximating the Jaccard similarity between two sets.
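As an illustration, near-duplicate detection of this kind can be sketched with the datasketch library, mirroring the parameters above (13-gram shingles, 128 permutations, 0.8 threshold); this is not the project's actual code:

```python
from datasketch import MinHash, MinHashLSH

def minhash_of(text: str, num_perm: int = 128) -> MinHash:
    """MinHash signature over a document's word 13-grams."""
    words = text.split()
    m = MinHash(num_perm=num_perm)
    for i in range(max(len(words) - 12, 1)):
        m.update(" ".join(words[i : i + 13]).encode("utf-8"))
    return m

# `docs` is a hypothetical {doc_id: text} mapping; documents whose estimated
# Jaccard similarity to an earlier document exceeds 80% are flagged.
docs = {"a": "first example document text", "b": "second example document text"}
lsh = MinHashLSH(threshold=0.8, num_perm=128)
duplicates = set()
for doc_id, text in docs.items():
    signature = minhash_of(text)
    if lsh.query(signature):
        duplicates.add(doc_id)
    else:
        lsh.insert(doc_id, signature)
```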

+

Is the software used to preprocess/clean/label the instances available?

+

Yes, the scripts are available here. The scripts use version 0.0.2 of the dfm package.

+

Uses

+

Has the dataset been used for any tasks already?

+

Yes, the dataset has been used to pre-train Danish language models. Parts of the dataset have also been used in [3] and [4].

+

Is there a repository that links to any or all papers or systems that use the dataset?

+

No.

+

What (other) tasks could the dataset be used for?

+

The scale of the dataset makes it suitable for NLP tasks such as language modeling. Similarly, the structure of the articles makes it a suitable dataset for training text summarisation models.

+

Is there anything about the composition of the dataset or the way it was collected and preprocessed/cleaned/labeled that might impact future uses?

+

This dataset is static and thus does not evolve over time with the language. A consequence of this is that it will become increasingly outdated over time.

+

Are there tasks for which the dataset should not be used?

+

This dataset contains Danish articles and thus should not be used for non-Danish language tasks.

+

As the writers of the content are predominantly journalists, the dataset reflects a particular writing style that is unlikely to represent the Danish language as a whole.

+

Distribution

+

Will the dataset be distributed to third parties outside of the entity (e.g., company, institution, organization) on behalf of which the dataset was created?

+

Data will only be available at the entity during the project. If you wish to access the dataset, you will have to come to an agreement with the individual Danish newspapers.

+

Citation

+

If you wish to cite this work, please see our GitHub page for an up-to-date citation: https://github.com/centre-for-humanities-computing/danish-foundation-models

+

References:

+
  • [1] Broder, Andrei Z. "On the resemblance and containment of documents." Proceedings. Compression and Complexity of SEQUENCES 1997 (Cat. No. 97TB100171). IEEE, 1997.
  • [2] Rae, J. W., Borgeaud, S., Cai, T., Millican, K., Hoffmann, J., Song, F., Aslanides, J., Henderson, S., Ring, R., Young, S., Rutherford, E., Hennigan, T., Menick, J., Cassirer, A., Powell, R., Driessche, G. van den, Hendricks, L. A., Rauh, M., Huang, P.-S., … Irving, G. (2021). Scaling Language Models: Methods, Analysis & Insights from Training Gopher. https://arxiv.org/abs/2112.11446v2
  • [3] Baglini, R. B., Nielbo, K. L., Hæstrup, F., Enevoldsen, K., Vahlstrup, P. B., & Roepstorff, A. (2021, June 2). When no news is bad news: Detection of negative events from news media content. https://2021.dhbenelux.org/
  • [4] Nielbo, K. L., Baglini, R. B., Vahlstrup, P. B., Enevoldsen, K. C., Bechmann, A., & Roepstorff, A. (2021, January). News information decoupling: An information signature of catastrophes in legacy news media. https://eadh2020-2021.org/
  • [5] T. Gebru, J. Morgenstern, B. Vecchione, J. W. Vaughan, H. Wallach, H. Daumé III, and K. Crawford. Datasheets for datasets. arXiv preprint arXiv:1803.09010, 2018.
diff --git a/datasheets/daradio/index.html b/datasheets/daradio/index.html
new file mode 100644
index 00000000..8aba6189
--- /dev/null
+++ b/datasheets/daradio/index.html
@@ -0,0 +1,762 @@
+[MkDocs Material page scaffolding omitted; page title: "DaRadio - Danish Foundation Models"]

DaRadio Datasheet

+

Version: 1.0.0

+

Homepage: https://github.com/centre-for-humanities-computing/danish-foundation-models

+

License: Not publicly available.

+
+

DaRadio consists of radio broadcasts from the Danish radio stations DR P1 and Radio24syv and contains approximately 140,000 hours of speech. DaRadio includes all shows aired on DR P1 from 2005 to 2021 and all shows aired on Radio24syv from 2011 to 2019.

+

DaRadio has been deduplicated using a series of heuristics based on metadata. For more on deduplication, see the data cleaning section further below.

+

Datasheet

+

Following the recommendation and framework of [1], we add the following datasheet.

+

Motivation

+

For what purpose was the dataset created? Who created the dataset? Who funded the creation of the dataset?

+

Data included in DaRadio was collected following the Danish Legal Deposit Act by the Royal Danish Library (RDL). From this, a dataset of Danish speech-only radio was derived by RDL. The dataset was created for research purposes, including training a Danish wav2vec2.0 model.

+

The dataset was preprocessed to remove duplicates by a team of researchers at the Center for Humanities Computing, Aarhus University (CHC) with collaborators from the Danish speech-processing company Alvenir.

+

Composition

+

What do the instances that comprise the dataset represent (e.g., documents, photos, people, countries)?

+

Instances of the dataset comprise an mp3 file for each show aired on the two stations within the period. Further metadata include information on the date and time of airing, the title, a short description of the show, and various internal identifiers used by RDL.

+

How many instances are there in total (of each type, if appropriate)?

+

DaRadio consists of a total of 215,582 hours of unprocessed Danish speech radio shows across two stations, DR P1 and Radio24syv. The table below shows the distribution over the stations with and without heuristic rerun removal.

| Source     | Duration (hours) | Reruns removed |
|------------|------------------|----------------|
| P1         | 145,160          | False          |
| P1         | 97,401           | True           |
| Radio24syv | 70,422           | False          |
| Radio24syv | 44,569           | True           |

Does the dataset contain all possible instances or is it a sample (not necessarily random) of instances from a larger set?

+

The dataset contains all shows from the two stations in the time period (2005-2021 for DR P1 and 2011-2019 for Radio24syv).

+

If the dataset is a sample from a larger set, what was the sampling strategy?

+

The dataset is a subset of all Danish radio. The two stations were chosen for the dataset as they are talk-radio only.

+

Who was involved in the data collection process?

+

The RDL collects Danish radio shows and constructed DaRadio to hand over to researchers at CHC.

+

Over what timeframe was the data collected?

+

The dataset includes radio shows from the period 2005 to 2021.

+

Were any ethical review processes conducted?

+

The RDL collects radio shows in adherence to Danish archival laws. DaRadio was constructed for a research project, for which a project proposal was accepted by RDL. No other ethical review processes were conducted.

+

Preprocessing/cleaning/labeling

+

Was any preprocessing/cleaning/labeling of the data done (e.g., discretization or bucketing, tokenization, part-of-speech tagging, SIFT feature extraction, removal of instances, processing of missing values)?

+

DaRadio has been deduplicated using a series of heuristic filters, and all files have been converted to 16 kHz .wav files.
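For illustration, such a conversion could be done with ffmpeg (a sketch assuming ffmpeg is on the PATH; the datasheet does not specify which tool was used):

```python
import subprocess

def to_16khz_wav(src_mp3: str, dst_wav: str) -> None:
    """Convert an mp3 file to a 16 kHz wav file; -ar sets the sample rate."""
    subprocess.run(["ffmpeg", "-y", "-i", src_mp3, "-ar", "16000", dst_wav], check=True)
```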

+

Reruns/duplicates were identified by the following rules (a code sketch follows below):

+
  • If the phrase "sendt første gang" ["aired the first time"] or "genudsendelse" ["rerun"] appeared in the show description.
  • If the title contained "(G)" (short for "genudsendelse").
  • If the show was broadcast between 23:00 and 5:00.

The deduplication was coded and conducted by researchers at CHC.
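A minimal sketch of the three rerun rules above (our illustration, not the CHC code):

```python
from datetime import time

def is_rerun(description: str, title: str, aired_at: time) -> bool:
    """Apply the rerun heuristics listed above."""
    desc = description.lower()
    if "sendt første gang" in desc or "genudsendelse" in desc:
        return True
    if "(G)" in title:
        return True
    # Shows broadcast between 23:00 and 05:00 are treated as reruns.
    return aired_at >= time(23, 0) or aired_at < time(5, 0)
```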

+

Is the software used to preprocess/clean/label the instances available?

+

The scripts are available at the following GitHub repository: link.

+

Uses

+

Has the dataset been used for any tasks already?

+

Yes, the dataset has been used to pre-train a Danish wav2vec2.0 model.

+

Is there a repository that links to any or all papers or systems that use the dataset?

+

No, but as of 23/10/16 no others have used the dataset.

+

What (other) tasks could the dataset be used for?

+

As the dataset only contains unlabelled data, i.e. no transcriptions, it is mainly designed for pre-training language models. However, given the metadata and recurring hosts, further processing might make it possible to train e.g. text-to-speech systems.

+

Is there anything about the composition of the dataset or the way it was collected and preprocessed/cleaned/labeled that might impact future uses?

+

This dataset is static and does not evolve over time with the language, and it will thus become increasingly outdated over time.

+

Distribution

+

Will the dataset be distributed to third parties outside of the entity (e.g., company, institution, organization) on behalf of which the dataset was created?

+

Data will only be available at the entity during the project. An equivalent or updated dataset can be requested at the Royal Danish Library.

+

Citation

+

If you wish to cite this work, please see our GitHub page for an up-to-date citation: https://github.com/centre-for-humanities-computing/danish-foundation-models

+

References:

+
  • [1] T. Gebru, J. Morgenstern, B. Vecchione, J. W. Vaughan, H. Wallach, H. Daumé III, and K. Crawford. Datasheets for datasets. arXiv preprint arXiv:1803.09010, 2018.
diff --git a/datasheets/hopetwitter/index.html b/datasheets/hopetwitter/index.html
new file mode 100644
index 00000000..5cbb221b
--- /dev/null
+++ b/datasheets/hopetwitter/index.html
@@ -0,0 +1,844 @@
+[MkDocs Material page scaffolding omitted; page title: "HopeTwitter - Danish Foundation Models"]

HopeTwitter

+

Version: 1.0.0

+

Homepage: https://github.com/centre-for-humanities-computing/danish-foundation-models

+

License: Not publicly available.

+
+

HopeTwitter consists of tweets collected from the Twitter API using a stopword list, and comprises 32.5 million tweets across 538,398 unique users. HopeTwitter includes tweets from 2019-01-01 to 2021-04-30.

+

HopeTwitter has been filtered to only include Danish tweets, based on the language tag from the Twitter API. Similarly, HopeTwitter has had low-quality tweets removed and has then been deduplicated to remove exact and near-duplicates. For more on data cleaning, see the section "Preprocessing/cleaning/labeling".

+

HopeTwitter includes a total of 0.97 billion tokens before filtering and 0.48 billion (50%) after.

+

Datasheet

+

Following the recommendation and framework of [3], we add the following datasheet.

+

Motivation

+

For what purpose was the dataset created? Who created the dataset? Who funded the creation of the dataset?

+

HopeTwitter was initially collected as a part of the HOPE project, examining societal behaviour during the COVID-19 pandemic. Next, HopeTwitter was cleaned in preparation for pre-training Danish language models by a team of researchers at the Center for Humanities Computing Aarhus (CHCAA), using a codebase jointly developed with partners from academia and industry, including KMD, Ekstra Bladet, Bristol University and Deepdivr. For more on collaborators on this project, see the GitHub repository.

+

Any other comments?

+

No.

+

Composition

+

What do the instances that comprise the dataset represent (e.g., documents, photos, people, countries)?

+

HopeTwitter consists of tweets containing at least one of a series of stopwords, collected through the Twitter API. See "If the dataset is a sample from a larger set, what was the sampling strategy?" for the stopword list.

+

How many instances are there in total (of each type, if appropriate)?

+

The dataset consists of 32,499,019 documents, of which 14,399,284 (44%) were considered duplicates.

+

Does the dataset contain all possible instances or is it a sample (not necessarily random) of instances from a larger set?

+

No. It does not contain all instances of Danish Twitter, as there are likely some Danish tweets which do not include a stopword.

+

Is there a label or target associated with each instance? If so, please provide a description.

+

No.

+

Are there recommended data splits (e.g., training, development/validation, testing)? If so, please provide a description of these splits, explaining the rationale behind them.

+

No splits are performed on this dataset.

+

If the dataset is a sample from a larger set, what was the sampling strategy?

+

Tweets were streamed continuously by querying a set of the highest-frequency Scandinavian-specific keywords from Danish, Norwegian (Bokmål), and Swedish, resulting in the following list (a keyword-match sketch follows the list):

aften, aldrig, alltid, altid, andet, arbejde, bedste, behöver, behøver, beklager,
+berätta, betyr, blev, blevet, blir, blitt, blive, bliver, bruge, burde, bättre, båe
+bør, deim, deires, ditt, drar, drepe, dykk, dykkar, där, död, döda, død, døde, efter,
+elsker, endnu, faen, fandt, feil, fikk, finner, flere, forstår, fortelle, fortfarande,
+fortsatt, fortælle, från, få, fået, får, fått, förlåt, första, försöker, før, først,
+første, gick, gikk, gillar, gjennom, gjerne, gjorde, gjort, gjør, gjøre, godt, gå, gång,
+går, göra, gør, gøre, hadde, hallå, havde, hedder, helt, helvete, hende, hendes, hennes,
+herregud, hjelp, hjelpe, hjem, hjälp, hjå, hjælp, hjælpe, honom, hossen, hvem, hvis,
+hvordan, hvorfor, händer, här, håll, håller, hør, høre, hører, igjen, ikkje, ingenting,
+inkje, inte, intet, jeres, jävla, kanske, kanskje, kender, kjenner, korleis, kvarhelst,
+kveld, kven, kvifor, känner, ledsen, lenger, lidt, livet, längre, låt, låter, længe,
+meget, menar, mycket, mykje, må, måde, många, mår, måske, måste, måtte, navn, nogen,
+noget, nogle, noko, nokon, nokor, nokre, någon, något, några, nån, når, nåt, nødt,
+också, også, pengar, penger, pratar, prøver, på, redan, rundt, rätt, sagde, saker,
+samma, sammen, selv, selvfølgelig, sidan, sidste, siger, sikker, sikkert, själv, skete,
+skjedde, skjer, skulle, sluta, slutt, snakke, snakker, snill, snälla, somt, stadig,
+stanna, sted, står, synes, säger, sätt, så, sådan, såg, sånn, tager, tiden, tilbage,
+tilbake, tillbaka, titta, trenger, trodde, troede, tror, två, tycker, tänker, uden,
+undskyld, unnskyld, ursäkta, uten, varför, varit, varte, veldig, venner, verkligen,
+vidste, vilken, virkelig, visste, väg, väl, väldigt, vän, vår, våra, våre, væk, vær, 
+være, været, älskar, åh, år, åt, över
+
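To illustrate the sampling strategy, the following minimal Python sketch checks whether a tweet matches the keyword list. It is illustrative only; the actual collection queried these keywords via the Twitter streaming API.

import re

# Truncated for illustration; the full keyword list is given above.
KEYWORDS = {"aften", "aldrig", "alltid", "altid", "andet"}

def matches_keywords(tweet_text: str) -> bool:
    """Return True if the tweet contains at least one tracked keyword."""
    tokens = re.findall(r"\w+", tweet_text.lower())
    return any(token in KEYWORDS for token in tokens)

print(matches_keywords("Vi ses i aften!"))  # True
print(matches_keywords("See you tonight"))  # False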

+

Who was involved in the data collection process?

+

A team of researchers at the Center for Humanities Computing Aarhus (CHCAA), including Kristoffer Nielbo and Peter Bjerregaard Vahlstrup, in collaboration with Rebekah Baglini at the School of Communication and Culture at Aarhus University.

+

Over what timeframe was the data collected?

+

The dataset includes tweets from the period 2019-01-01 to 2021-04-30.

+

Were any ethical review processes conducted?

+

No.

+

Preprocessing/cleaning/labeling

+

Was any preprocessing/cleaning/labeling of the data done (e.g., discretization or bucketing, tokenization, part-of-speech tagging, SIFT feature extraction, removal of instances, processing of missing values)?

+

Firstly, HopeTwitter had non-Danish tweets removed, after which a series of heuristic filters were applied, including the removal of repetitious texts. Following the filtering, HopeTwitter was deduplicated, removing both exact duplicates and near-duplicates.

+

Of all documents, 3,023,427 (9%) were filtered out due to low quality and 14,399,284 (33%) because they were near-duplicates.

+

For the quality filtering, HopeTwitter applies a filter akin to [2], which retains text that:

+
  • Contain at least 2 Danish stopwords. For the stopword list we use the one used in SpaCy v.3.1.4.
  • Have a mean word length between 2 and 14.
  • Have a token length between 10 and 100,000.
  • Have less than 5,000,000 characters.
  • Have less than 60% of words containing an alphabetic character.
  • Have a low degree of repetitious text:
      • Have less than 20% of characters contained within duplicate lines.
      • Have less than 20% of characters contained within duplicate paragraphs.
      • Where the top 2-4 grams constitute less than 20%, 18%, 16%, respectively, of the text.
      • Where the duplicate 5-10 grams constitute less than 25%, 24%, 23%, 22%, 21%, 20% of the text, respectively.
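To make the heuristics concrete, here is a minimal sketch of the first two filters above. This is an illustrative reconstruction, not the project's actual code (which lives in the dfm package), and the stopword list is a stand-in for SpaCy's.

# Illustrative reconstruction of two of the quality heuristics above.
DANISH_STOPWORDS = {"og", "i", "jeg", "det", "at"}  # stand-in for SpaCy v.3.1.4's list

def has_min_stopwords(text: str, minimum: int = 2) -> bool:
    """Keep text containing at least `minimum` Danish stopwords."""
    tokens = text.lower().split()
    return sum(token in DANISH_STOPWORDS for token in tokens) >= minimum

def has_valid_mean_word_length(text: str, low: float = 2, high: float = 14) -> bool:
    """Keep text whose mean word length lies between 2 and 14 characters."""
    tokens = text.split()
    if not tokens:
        return False
    return low <= sum(len(t) for t in tokens) / len(tokens) <= high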

The deduplication removed all documents with a 10-gram Jaccard similarity higher than 80%, following the MinHash algorithm [1] using 128 permutations. MinHash is a probabilistic method for approximating the Jaccard similarity between two sets.
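Near-duplicate detection of this kind can be sketched with the datasketch library; this is shown purely as an illustration, since the datasheet does not state which MinHash implementation was used.

from datasketch import MinHash, MinHashLSH

def ten_grams(text: str, n: int = 10):
    """Yield word 10-grams, matching the deduplication above."""
    tokens = text.split()
    for i in range(max(len(tokens) - n + 1, 0)):
        yield " ".join(tokens[i:i + n])

def signature(text: str) -> MinHash:
    m = MinHash(num_perm=128)  # 128 permutations, as in the datasheet
    for gram in ten_grams(text):
        m.update(gram.encode("utf-8"))
    return m

# An LSH index with a 0.8 Jaccard threshold flags near-duplicates.
lsh = MinHashLSH(threshold=0.8, num_perm=128)
lsh.insert("tweet_a", signature("en meget lang tweet " * 10))
print(lsh.query(signature("en meget lang tweet " * 9)))  # ["tweet_a"]: a near-duplicate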

+

Is the software used to preprocess/clean/label the instances available?

+

Yes, the scripts are available here. The scripts use version 0.0.2 of the dfm package.

+

Uses

+

Has the dataset been used for any tasks already?

+

Yes, the dataset has been used to pre-train Danish language models. Parts of the dataset have also been used in HOPE project reports and in [4].

+

Is there a repository that links to any or all papers or systems that use the dataset?

+

There is a website for the HOPE project, for which the dataset was initially collected. This website contains reports and articles regarding the dataset.

+

What (other) tasks could the dataset be used for?

+

The scale of the dataset makes it suitable for NLP tasks such as language modelling. Similarly, one could imagine the conversation structure being used to train conversational chatbots.

+

Is there anything about the composition of the dataset or the way it was collected and preprocessed/cleaned/labeled that might impact future uses?

+

This dataset is static and thus does not evolve over time with the language. A consequence of this is that it will become increasingly outdated over time. However, it is possible to extend the dataset by a continual collection of tweets.

+

Are there tasks for which the dataset should not be used?

+

HopeTwitter contains Danish tweets and thus should not be used for non-Danish language tasks.

+

As the writers of the content are predominantly journalists, politicians, influencers, and academics, the dataset reflects a certain social group which is unlikely to reflect the Danish population as a whole.

+

Distribution

+

Will the dataset be distributed to third parties outside of the entity (e.g., company, institution, organization) on behalf of which the dataset was created?

+

Data will only be available at the entity during the project. After the project, the data will be archived for a period of five years to comply with the university policy for research integrity. After the five years, the data will be registered at the national archives, as required by executive order 514, for potential long-term deposit.

+

Citation

+

If you wish to cite this work, please see our GitHub page for an up-to-date citation: https://github.com/centre-for-humanities-computing/danish-foundation-models

+

References:

+
  • [1] Broder, Andrei Z. "On the resemblance and containment of documents." Proceedings. Compression and Complexity of SEQUENCES 1997 (Cat. No. 97TB100171). IEEE, 1997.
  • [2] Rae, J. W., Borgeaud, S., Cai, T., Millican, K., Hoffmann, J., Song, F., Aslanides, J., Henderson, S., Ring, R., Young, S., Rutherford, E., Hennigan, T., Menick, J., Cassirer, A., Powell, R., Driessche, G. van den, Hendricks, L. A., Rauh, M., Huang, P.-S., … Irving, G. (2021). Scaling Language Models: Methods, Analysis & Insights from Training Gopher. https://arxiv.org/abs/2112.11446v2
  • [3] T. Gebru, J. Morgenstern, B. Vecchione, J. W. Vaughan, H. Wallach, H. Daumé III, and K. Crawford. Datasheets for datasets. arXiv preprint arXiv:1803.09010, 2018.
  • [4] Johansen, N., Marjanovic, S. V., Kjaer, C. V., Baglini, R. B., & Adler-Nissen, R. (2022). Ridiculing the "tinfoil hats:" Citizen responses to COVID-19 misinformation in the Danish facemask debate on Twitter. Harvard Kennedy School Misinformation Review. https://doi.org/10.37016/mr-2020-93
diff --git a/datasheets/netarkivet_text/index.html b/datasheets/netarkivet_text/index.html
new file mode 100644
index 00000000..26e7f92a

NAT: Netarkivet Text

+

Version: 1.0.0

+

Homepage: https://github.com/centre-for-humanities-computing/danish-foundation-models

+

license: Not publicly available.

+
+

This datasheet is currently being revised 🛠️

diff --git a/dcc/index.html b/dcc/index.html
new file mode 100644
index 00000000..4bd2e6f6

DCC v1

+

The DCC is a composite corpus consisting of the following subcorpora. For more information about the specific subcorpora, feel free to check out the individual datasheets.

Name            | Description         | Size          | Open Access | Novel Corpus
--------------- | ------------------- | ------------- | ----------- | ------------
Text            |                     |               |             |
DAGW            | Danish Gigaword     | 1B tokens     | ✓           | ✗
reddit-da       | Danish Reddit       | <.1B tokens   | ✓           | ✗
HopeTwitter     | Danish Tweets       | 0.48B tokens  | ✗           | ✓
DaNews          | Danish newspapers   | 0.5B tokens   | ✗           | ✓
Netarkivet Text | Danish internet     | >100B tokens  | ✗           | ✓
Speech          |                     |               |             |
DaRadio         | Danish talk radio   | 140,000 hours | ✗           | ✓
DaTV            | Danish subtitled TV | 900 hours     | ✗           | ✓
diff --git a/desc_stats/netarkivet_sites_collected_pr_month.png b/desc_stats/netarkivet_sites_collected_pr_month.png
new file mode 100644
index 00000000..352beec6
Binary files /dev/null and b/desc_stats/netarkivet_sites_collected_pr_month.png differ
diff --git a/desc_stats/netarkivet_top_50_domains.png b/desc_stats/netarkivet_top_50_domains.png
new file mode 100644
index 00000000..5c1ea35e
Binary files /dev/null and b/desc_stats/netarkivet_top_50_domains.png differ
diff --git a/index.html b/index.html
new file mode 100644
index 00000000..b6765e2c

Empowering the Danish Language in the Digital Age

+

Welcome to the Danish Foundation Models (DFM) project, a pioneering initiative in the field of machine learning and natural language processing (NLP) dedicated to the Danish language. Our mission is to develop, maintain, and provide open access to high-quality foundation models tailored for Danish, promoting innovation and inclusivity in language technologies.

+

Why Danish Foundation Models?

+

Bridging the Digital Language Divide

+
  • Global Gap: The rise of large language models has transformed research and technology, but smaller languages like Danish risk falling behind in development, evaluation, and application.
  • Local Focus: We combat this by focusing on the Danish language, ensuring that it is well-represented in the digital landscape.
  • Broad Collaboration: Our project unites public and private institutions, ensuring high data quality and practical applicability of our models.

Our Objectives

+
  1. To develop and maintain state-of-the-art language models for Danish for applications within both text and speech.
  2. To extensively validate foundation models for Danish in a representative set of tasks.
  3. To maintain a high standard of documentation of models such as model cards [Mitchell et al., 2019] and datasheets [Gebru et al., 2021].
  4. To open-source not only the models but also all components required for reproducibility such as pre-processing, training, and validation code.

You can read more about the argument for Danish language models in our publication.

+

Open-source models on closed-source data

+

As many of the datasets we use either contain personally sensitive information or fall under copyright, they can't be shared publicly. However, we want to share as much as possible from the project while protecting privacy and adhering to copyright law. Thus, we organize the project such that all parts that can be shared, and those that can't, are well-documented using datasheets and training logs. Furthermore, during data processing and training, the data is stored on UCloud, which follows the highest standards of information security management with a formal ISO27001 certification.

+

+

Join Us

+

We invite collaboration and contributions from industry professionals, researchers, and the open-source community. Together, we can advance the field of Danish NLP and create a more inclusive digital future. You can reach out to us using the following channels:

🗣 DDSC Slack        | Join the discussion in the "danish-foundation-models-text"-channel
💬 GitHub Discussion | Ask questions or start a discussion
🚨 GitHub Issues     | Noticed a bug in the code? Please create an issue
diff --git a/intercoder_reliability/index.html b/intercoder_reliability/index.html
new file mode 100644
index 00000000..30550679

Results from corpus tagging

+

Each user tagged 100 documents unless otherwise specified. Documents were split by newlines into text-blocks, and each block was rated. Text-blocks longer than 1000 characters were split into multiple blocks of 1000 characters or less.
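A minimal sketch of this splitting procedure (our reconstruction; the function name and the handling of empty blocks are assumptions, as the text does not specify them):

def split_into_blocks(document: str, max_chars: int = 1000) -> list[str]:
    """Split a document by newlines, then chunk blocks longer than 1000 characters."""
    blocks = []
    for block in document.split("\n"):
        if block.strip():  # empty blocks are skipped (an assumption)
            blocks.extend(block[i:i + max_chars] for i in range(0, len(block), max_chars))
    return blocks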

+

This tagging scheme is similar to (Kreutzer et al., 2022).

+

Each block was put into one of the following categories:

+
  • wrong_language: Not Danish
  • skipped: Unsure of category
  • correct_language: Danish text where at least 80% of the text is reasonable.
  • not_language: Text where less than 80% of the text is reasonable. Takes priority over wrong_language.

Additionally, each block was tagged for pornography (yes/no) and offensiveness (yes/no).

+

Text proportions

+
+

Kenneth (Session: test)

+
  • Date: 2022-09-05
  • Sentences tagged: 102
  • Documents tagged: na

Proportions:

  • 69.16% of characters is correct_language
  • 25.66% of characters is not_language
  • 2.74% of characters is skipped
  • 2.45% of characters is wrong_language
  • 0.00% of characters is porn
  • 0.00% of characters is offensive

Kenneth (Session: 1)

+
  • Date: 2022-09-06
  • Sentences tagged: 292
  • Documents tagged: 100

Proportions:

  • 68.03% of characters is correct_language
  • 29.19% of characters is not_language
  • 2.10% of characters is skipped
  • 0.68% of characters is wrong_language
  • 0.00% of characters is porn
  • 1.38% of characters is offensive

Lasse (Session: 1)

+
  • Date: 2022-09-07
  • Sentences tagged: 336
  • Documents tagged: 100

Proportions:

  • 68.02% of characters is correct_language
  • 30.97% of characters is not_language
  • 1.01% of characters is wrong_language
  • 0.26% of characters is porn
  • 0.00% of characters is offensive

Intercoder Reliability

+
+

Kenneth (Session: test) vs Kenneth - (Session: 1)

+
  • Cohen's Kappa (all categories): 0.8242 (Overlap in sentences: 98)
  • Cohen's Kappa (correct_language vs not correct_language): 0.9075 (Overlap in sentences: 98)

Kenneth (Session: test) vs Lasse - (Session: 1)

+
  • Cohen's Kappa (all categories): 0.8140 (Overlap in sentences: 95)
  • Cohen's Kappa (correct_language vs not correct_language): 0.8389 (Overlap in sentences: 95)

Kenneth (Session: 1) vs Lasse - (Session: 1)

+
  • Cohen's Kappa (all categories): 0.6767 (Overlap in sentences: 245)
  • Cohen's Kappa (correct_language vs not correct_language): 0.7259 (Overlap in sentences: 245)
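For reference, agreement scores of this kind can be computed with scikit-learn; the sketch below uses toy labels and is not the project's actual evaluation script:

from sklearn.metrics import cohen_kappa_score

# Toy labels for the same overlapping blocks, rated by two annotators.
rater_a = ["correct_language", "not_language", "correct_language", "skipped"]
rater_b = ["correct_language", "not_language", "wrong_language", "skipped"]

print(cohen_kappa_score(rater_a, rater_b))  # kappa over all categories

# Binarised agreement: correct_language vs everything else.
print(cohen_kappa_score(
    [label == "correct_language" for label in rater_a],
    [label == "correct_language" for label in rater_b],
))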

Comparison with mC4

+
+

Note: mC4 had a high degree of repetitious texts. Similarly, when text-blocks in mC4 were not language, they were often something like:

+
2lineStart%22%3A%22%22%2C%22placeholder%22%3A1%2C%22extName%22%3A%22nowiki%22%7D"" class=""placeholder placeholder-ext"" contenteditable=""false"">]&#x200b;</span></a></sup>&#x200b;</span>, at en lurifaks som Jimmy page, bruger MIT navn til opfindelsen! SV<span data-rte-instance=""1524-12953202845f3523698f3f1"" data-rte-meta=""%7B%22type%22%3A%22ext%22%2C%22wikitext%22%3A%22%3Cref%3ESVIN%3C%5C%2Fref%3E%22%2C%22lineStart%22%3A%22%22%2C%22placeholder%22%3A1%2C%22extName%22%3A%22ref%22%7D"" class=""placeholder placeholder-ext"" contenteditable=""false""><sup data-rte-washtml=""1"" id=""cite_ref-2"" class=""reference"" data-rte-attribs=""
+
+

While non-language texts in NAT were often menu bars, contact information, or navigation.

+

Kenneth (Session: 1)

+
  • Date: 2022-09-06
  • Sentences tagged: 325
  • Documents tagged: 100

Proportions:

  • 62.47% of characters is correct_language
  • 34.88% of characters is not_language
  • 1.27% of characters is skipped
  • 1.38% of characters is wrong_language
  • 3.25% of characters is porn
  • 0.00% of characters is offensive
diff --git a/models/index.html b/models/index.html
new file mode 100644
index 00000000..0de64f03

Models

+ +

This section gives an overview of the models available through the DFM project. The models are available through the Huggingface model hub. To avoid duplicating information, the details regarding each model are available in that model's model sheet.

+

Text Models

Model                 | Model type | Size (parameters)
--------------------- | ---------- | -----------------
dfm-encoder-large-v1  | Encoder    | large (355M)
dfm-encoder-medium-v1 | Encoder    | medium (110M)
dfm-encoder-small-v1  | Encoder    | small (22M)
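A model from the table can be loaded from the Huggingface hub roughly as follows. This is a sketch: the hub id used below is an assumption based on the naming in the table; check the model sheet on the hub for the exact name.

from transformers import AutoModel, AutoTokenizer

model_id = "chcaa/dfm-encoder-large-v1"  # assumed hub id; verify on the model hub
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModel.from_pretrained(model_id)

inputs = tokenizer("Dette er en dansk sætning.", return_tensors="pt")
outputs = model(**inputs)  # encoder hidden states for downstream tasks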
+

Speech Models

Model                       | Model type
--------------------------- | ----------------------------
xls-r-300m-danish           | Pretrained wav2vec2.0 model
xls-r-300m-danish-nst-cv9   | Automatic speech recognition
chcaa/xls-r-300m-nst-cv9-da | Automatic speech recognition
diff --git a/search/search_index.json b/search/search_index.json
new file mode 100644
index 00000000..44699182
diff --git a/sitemap.xml b/sitemap.xml
new file mode 100644
index 00000000..0f8724ef
diff --git a/sitemap.xml.gz b/sitemap.xml.gz
new file mode 100644
index 00000000..5745efb2
Binary files /dev/null and b/sitemap.xml.gz differ