Skip to content

Commit

Permalink
compound word processing: added some words to prefixes, tagged some others to increase coverage
Browse files Browse the repository at this point in the history
  • Loading branch information
merisiga committed Jun 13, 2024
1 parent 2307d8d commit ba18a4d
Show file tree
Hide file tree
Showing 5 changed files with 41 additions and 46 deletions.
40 changes: 11 additions & 29 deletions src/fst/morphology/root.lexc
Original file line number Diff line number Diff line change
Expand Up @@ -692,28 +692,13 @@ Multichar_Symbols
@P.Stem.Single@ ! does not participate in compounding (used for words that would normally compound)
@R.Stem.Single@
@D.Stem.Single@ ! more restrictive than @R.Part.One@
@P.Stem.Final@ ! may be only the last part: alam+N, ülem+N, vana+N
@R.Stem.Final@
@D.Stem.Final@
@P.Stem.Nom@ ! Even if this word is in Sg Nom, something may follow it in a compound
@R.Stem.Nom@ ! Previous word should have been in Sg Nom
@D.Stem.Nom@ ! Previous word should not have been in Sg Nom
@P.Stem.Verb@ ! to remember that A or N was derived from verb
@R.Stem.Verb@
@D.Stem.Verb@

! @P.Stem.ise@ ! enese + anything
! @R.Stem.ise@
! @D.Stem.ise@
! @P.Stem.üle@ ! üle + geoname + line (üle+siberi+line)
! @R.Stem.üle@
! @D.Stem.üle@
! @P.Stem.vähe@ ! vähe, puht + adjective
! @R.Stem.vähe@
! @D.Stem.vähe@
! @P.Stem.topelt@ ! Adv topelt et al; Adv + anything
! @R.Stem.topelt@
! @D.Stem.topelt@
@P.Stem.Guessed@ ! stem is guessed, it is not from the lexicon
@R.Stem.Guessed@
@D.Stem.Guessed@ ! if a word has it, then this word cannot follow a guessed stem
Expand Down Expand Up @@ -1105,8 +1090,8 @@ LEXICON Nonfirst

! D.Der means that the previous word should not be a derived one;
! it may be too restrictive here...
! previous word should not be allowed only single, nor one that can be only final
@D.Stem.Single@@D.Stem.Final@@C.Cap@@D.Case.PlNom@@D.Der.N@@D.Der.A@@D.Der.Adv@ Latter ; ! a latter part
! the previous word must not be one that is allowed only as a single (non-compounding) word
@D.Stem.Single@@C.Cap@@D.Case.PlNom@@D.Der.N@@D.Der.A@@D.Der.Adv@ Latter ; ! a latter part


! the nature of the previous word is reflected by flag diacritics that were set back then
Expand Down Expand Up @@ -1354,32 +1339,29 @@ LEXICON LatterVerb ! a verb derivation

LEXICON StartCompoundException

sama+Pron+Sg+Nom:sama NextPlainLatterAdjective ; ! samasuur
ise+Pron+Sg+Gen:enese NextLatterNoun ; ! eneseabi, enesekriitika etc
ise+Pron+Sg+Gen:enese NextPlainLatterAdjective ; ! enesepunane, eneseuhke
sama+Pron+Sg+Nom:sama NextPlainLatterAdjective "weight: 4 "; ! samasuur
ise+Pron+Sg+Gen:enese NextLatterNoun "weight: 4 "; ! eneseabi, enesekriitika etc
ise+Pron+Sg+Gen:enese NextPlainLatterAdjective "weight: 4 "; ! enesepunane, eneseuhke

igaüks+Pron+Sg+Gen#:iga#ühe NextLatterNoun ; ! igaühekriitika etc
igaüks+Pron+Sg+Gen#:iga#ühe NextLatterNoun "weight: 7 " ; ! igaühekriitika etc
mitu+Pron+Sg+Gen#:mitme NextLatterNoun "weight: 5 " ; ! mitmepäevareis; analysis side ends in the compound boundary '#', surface side is the genitive stem (same shape as the igaüks entry)

! general pattern is Adverb + LatterAdjective_v
! thus we must add Adverb + PlainLatterAdjective, Adverb + LatterAdjective_ne

üle+Adv:üle NextLatterProperNounGeo "weight: 3 " ; ! ülesiberiline ?
üle+Adv:üle NextLatterNoun "weight: 3 " ; ! ületoodang etc
üle+Adv:üle NextPlainLatterAdjective "weight: 3 " ; ! ülesuur ???
üle+Adv:üle NextLatterAdjective_ne "weight: 3 " ; ! üleaastaline
vähe+Adv:vähe NextPlainLatterAdjective "weight: 5 " ; ! vähesuur
vähe+Adv:vähe NextLatterAdjective_ne "weight: 5 " ; ! vähekompilatiivne; vähe + kalor +Der/line; not väheautone
puht+Adv:puht NextPlainLatterAdjective "weight: 9 " ; ! puhtsuur
puht+Adv:puht NextLatterAdjective_ne "weight: 9 " ; ! puhtkompilatiivne

otse+Adv:otse NextNounAdj "weight: 6 " ;
püsti+Adv:p˘üsʲti NextNounAdj "weight: 6 " ;
järel+Adv:järel NextNounAdj "weight: 5 " ;
koos+Adv:k˘oos NextNounAdj "weight: 4 " ;
kõrval+Adv:kõrval NextNounAdj "weight: 5 " ;
läbi+Adv:läbi NextNounAdj "weight: 4 " ;
ratsa+Adv:r˘atsa NextNounAdj "weight: 10 " ;
taga+Adv:taga NextNounAdj "weight: 5 " ;
topelt+Adv:topelt NextNounAdj "weight: 9 " ; ! topeltriiul, topelthallikas, ...
tagasi+Adv:tagasi NextLatterNoun "weight: 4 " ;
ümber+Adv:˘ümber NextLatterNoun "weight: 4 " ;
tasa+Adv:tasa NextNounAdj "weight: 7 " ;
vallas+Adv:vallas NextNounAdj "weight: 11 " ;
vastas+Adv:vastas NextNounAdj "weight: 8 " ;
vastu+Adv:v˘astu NextNounAdj "weight: 4 " ;
Expand Down
2 changes: 1 addition & 1 deletion src/fst/morphology/stems/comparative_adjectives.lexc
Original file line number Diff line number Diff line change
Expand Up @@ -375,7 +375,7 @@ vapram+A+Comp:v˘apram MAGUS "weight: 11 " ;
varasem+A+Comp:varasem MAGUS "weight: 6 " ;
varem+A+Comp:varem MAGUS "weight: 5 " ;
varjulisem+A+Comp:varjulisem MAGUS "weight: 11 " ;
vasem+A+Comp:vasem MAGUS "weight: 8 " ;
@P.Stem.Nom@vasem+A+Comp:@P.Stem.Nom@vasem MAGUS "weight: 8 " ;
vastikum+A+Comp:vasʲtikum MAGUS "weight: 10 " ;
vastupidavam+A+Comp:v˘astu#pidavam MAGUS "weight: 9 " ;
vastutustundlikum+A+Comp:vastutus#t˘undlikum MAGUS "weight: 11 " ;
Expand Down
16 changes: 8 additions & 8 deletions src/fst/morphology/stems/nouns.lexc
Original file line number Diff line number Diff line change
Expand Up @@ -921,7 +921,7 @@ alalisvool+N:alalis#v˘ool%>%{pl.e%} KOON "weight: 11 " ;
alaltütlev+N:alalt#˘ütlev MAGUS "weight: 11 " ;
alalõug+N:ala#l˘õuG1%>%{pl.u%} PIIM "weight: 10 " ;
alalütlev+N:alal#˘ütlev MAGUS "weight: 11 " ;
@P.Stem.Final@@P.Stem.Nom@alam+N:@P.Stem.Final@@P.Stem.Nom@alam MAGUS "weight: 8 " ;
@P.Stem.Nom@alam+N:@P.Stem.Nom@alam MAGUS "weight: 8 " ;
alamaadel+N:alam#˘aadel TOOBER "weight: 11 " ;
alamalseisev+N:alama#ls˘eisE2v MAGUS "weight: 11 " ;
alamik+N:ˈalam˘ik ELANIK "weight: 11 " ;
Expand Down Expand Up @@ -17310,7 +17310,7 @@ liipjalg+N:l˘iip#j˘alG1%>%{pl.u%} JALG "weight: 11 " ;
@D.Case.Nom@liir+N:@D.Case.Nom@l˘iir TAUD "weight: 9 " ;
liisingufirma+N:liisʲingu#firma PERE "weight: 11 " ;
liisk+N:l˘iisK1%>%{pl.e%}%{rare%} KOON "weight: 10 " ;
liist+N:l˘iist%>%{pl.e%} KOON "weight: 9 " ;
@P.Stem.Nom@liist+N:@P.Stem.Nom@l˘iist%>%{pl.e%} KOON "weight: 9 " ;
@P.Stem.Nom@liistak+N:@P.Stem.Nom@liistak VIRSIK "weight: 11 " ;
@P.Stem.Nom@liistik+N:@P.Stem.Nom@liistik VIRSIK "weight: 11 " ;
liisualune+N:liisu#alune SOOLANE "weight: 11 " ;
Expand Down Expand Up @@ -19591,7 +19591,7 @@ matuserahvas+N:matuse#r˘ahva KOTKAS "weight: 11 " ;
matuserong+N:matuse#r˘ong TAUD "weight: 11 " ;
matuserongkäik+N:matuse#r˘ong#k˘äik%>%{pl.e%} KOON "weight: 11 " ;
@P.Stem.Nom@matusetalitus+N:@P.Stem.Nom@matuse#talitus OLULINE "weight: 10 " ;
matš+N:m˘atš TAUD "weight: 9 " ;
@P.Stem.Nom@matš+N:@P.Stem.Nom@m˘atš TAUD "weight: 9 " ;
matšeete+N:matšeete PERE "weight: 11 " ;
@P.Stem.Single@mauk+N:@P.Stem.Single@m˘auk%>%{pl.e%}%{rare%} KOON "weight: 11 " ;
@P.Stem.Single@maur+N:@P.Stem.Single@m˘aur TAUD "weight: 11 " ;
Expand Down Expand Up @@ -32761,7 +32761,7 @@ sürjalane+N:sürjalane OLULINE "weight: 11 " ;
sürrealism+N:sürreal˘ism TAUD "weight: 10 " ;
süsadminn+N:s˘üs#adm˘inn TAUD "weight: 11 " ;
@P.Stem.Nom@süserään+N:@P.Stem.Nom@süser˘ään TAUD "weight: 11 " ;
süsi+N:süS1 MESI "weight: 8 " ;
@P.Stem.Nom@süsi+N:@P.Stem.Nom@süS1 MESI "weight: 8 " ;
süsihappegaas+N:süsi#h˘appe#g˘aasʲ TAUD "weight: 9 " ;
@P.Stem.Nom@süsik+N:@P.Stem.Nom@süsik VIRSIK "weight: 11 " ;
süsikas+N:süsika LOHAKAS "weight: 11 " ;
Expand Down Expand Up @@ -34109,7 +34109,7 @@ termokauter+N:termo#k˘auter TOOBER "weight: 11 " ;
termomeeter+N:termo#m˘eeter TOOBER "weight: 10 " ;
@P.Stem.Nom@termos+N:@P.Stem.Nom@termos SOOLANE "weight: 10 " ;
termosnõu+N:termos#n˘õu KAVA "weight: 11 " ;
termostaat+N:termo#st˘aat TAUD "weight: 10 " ;
@P.Stem.Nom@termostaat+N:@P.Stem.Nom@termo#st˘aat TAUD "weight: 10 " ;
ternes+N:t˘erne KOTKAS "weight: 11 " ;
ternespiim+N:ternes#p˘iim%>%{pl.u%}%{rare%} PIIM "weight: 11 " ;
terraarium+N:terr˘aarium MUUSEUM "weight: 11 " ;
Expand Down Expand Up @@ -35249,7 +35249,7 @@ tuhamägi+N:tuha#mäG1%{stemill%} NIMI "weight: 11 " ;
tuhandeline+N:tuhandeline OLULINE "weight: 11 " ;
tuhandene+N:tuhandene OLULINE "weight: 11 " ;
tuhandik+N:tˈuhand˘ik ELANIK "weight: 11 " ;
tuhar+N:tuhar MAGUS "weight: 10 " ;
@P.Stem.Nom@tuhar+N:@P.Stem.Nom@tuhar MAGUS "weight: 10 " ;
tuhasool+N:tuha#s˘ool%>%{pl.i%} PIIM "weight: 11 " ;
tuhastamine+N:tuhastamine OLULINE "weight: 11 " ;
tuhastamisplats+N:tuhastamis#pl˘atʲs TAUD "weight: 11 " ;
Expand Down Expand Up @@ -37418,7 +37418,7 @@ valvkond+N:v˘alvk˘onD1%>%{pl.i%} PÄEV "weight: 11 " ;
@P.Stem.Nom@vamp+N:@P.Stem.Nom@v˘amp TAUD "weight: 11 " ;
@P.Stem.Nom@vampiir+N:@P.Stem.Nom@vamp˘iir TAUD "weight: 10 " ;
@P.Stem.Nom@vampiiritar+N:@P.Stem.Nom@vampiiritar TAUD "weight: 11 " ;
@P.Stem.Final@vana+N:@P.Stem.Final@vana%>%{pl.u%} KAVA "weight: 4 " ;
vana+N:vana%>%{pl.u%} KAVA "weight: 4 " ;
vana-aasta+N:vana-˘aasta AASTA "weight: 11 " ;
@P.Stem.Nom@vanaadium+N:@P.Stem.Nom@van˘aadium MUUSEUM "weight: 11 " ;
vanaaegkond+N:vana#˘aegk˘onD1%>%{pl.i%} PÄEV "weight: 10 " ;
Expand Down Expand Up @@ -40395,7 +40395,7 @@ zoroastrist+N:zoro#astr˘ist TAUD "weight: 11 " ;
ülelend+N:üle#l˘enD1%>%{pl.e%} KOON "weight: 11 " ;
@P.Stem.Nom@üleliigsus+N:@P.Stem.Nom@üle#l˘iigsus SUULINE "weight: 11 " ;
ülelöök+N:üle#l˘öök TAUD "weight: 11 " ;
@P.Stem.Final@ülem+N:@P.Stem.Final@ülem MAGUS "weight: 7 " ;
ülem+N:ülem MAGUS "weight: 7 " ;
ülemaadel+N:ülem#˘aadel TOOBER "weight: 11 " ;
@P.Stem.Nom@ülemeelikus+N:@P.Stem.Nom@üle#meelikus OLULINE "weight: 11 " ;
ülemik+N:ˈülem˘ik ELANIK "weight: 11 " ;
Expand Down
15 changes: 12 additions & 3 deletions src/fst/morphology/stems/prefixes.lexc
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ anti+Pref:anʲti»- # ;
argi+Pref:argi»- # ;
astro+Pref:astro»- # ;
bio+Pref:b˘io»- # ;
dendro+Pref:dendro»- # ;
dendro+Pref:d˘endro»- # ;
detsi+Pref:detsi»- # ;
digi+Pref:digi»- # ;
eba+Pref:eba»- # ;
Expand Down Expand Up @@ -46,6 +46,7 @@ isas+Pref:isas»- # ;
isoleer+Pref:isoleer»- # ;
iti+Pref:iti»- # ;
jae+Pref:j˘ae»- # ;
järel+Pref:järel»- # ;
kahend+Pref:kahend»- # ;
kaksis+Pref:kaksis»- # ;
kald+Pref:k˘ald»- # ;
Expand Down Expand Up @@ -89,10 +90,12 @@ nuri+Pref:nuri»- # ;
nõuka+Pref:nõuka»- # ;
nöbi+Pref:nöbi»- # ;
nüüdis+Pref:nüüdis»- # ;
otse+Pref:otse»- # ;
paisk+Pref:p˘aisk»- # ;
paleo+Pref:paleo»- # ;
para+Pref:para»- # ;
pikamaa+Pref:pika#m˘aa»- # ;
piki+Pref:piki»- # ;
piko+Pref:piko»- # ;
pisi+Pref:pisi»- # ;
polaar+Pref:pol˘aar»- # ;
Expand All @@ -102,10 +105,12 @@ pop+Pref:p˘op»- # ;
pseudo+Pref:pseudo»- # ;
psühho+Pref:psühho»- # ;
puhke+Pref:p˘uhke»- # ;
puht+Pref:p˘uht»- # ;
puhtavee+Pref:p˘uhta#v˘ee»- # ;
põhi+Pref:põhi»- # ;
põiki+Pref:põiki»- # ;
põlis+Pref:põlis»- # ;
pöörd+Pref:pöörd»- # ;
pöörd+Pref:p˘öörd»- # ;
püst+Pref:p˘üst»- # ;
raid+Pref:r˘aid»- # ;
ripp+Pref:r˘ipp»- # ;
Expand All @@ -120,11 +125,14 @@ sise+Pref:sise»- # ;
sisi+Pref:sisi»- # ;
supel+Pref:supel»- # ;
süva+Pref:süva»- # ;
taas+Pref:t˘aas»- # ;
taga+Pref:taga»- # ;
tehno+Pref:tehno»- # ;
termo+Pref:termo»- # ;
tetra+Pref:t˘etra»- # ;
toor+Pref:toor»- # ;
toor+Pref:t˘oor»- # ;
tootmis+Pref:t˘ootmis»- # ;
topelt+Pref:topelt»- # ;
turbo+Pref:turbo»- # ;
umb+Pref:˘umb»- # ;
vaeg+Pref:v˘aeg»- # ;
Expand All @@ -141,5 +149,6 @@ vääris+Pref:vääris»- # ;
ühis+Pref:ühis»- # ;
üla+Pref:üla»- # ;
üld+Pref:˘üld»- # ;
üle+Pref:üle»- # ;
üli+Pref:üli»- # ;
ürg+Pref:˘ürg»- # ;
14 changes: 9 additions & 5 deletions src/import/fsgt2final.sh
Original file line number Diff line number Diff line change
Expand Up @@ -182,6 +182,7 @@ cat comparative_adjectives.tmp1 \
| sed '/^noorem+/s/^\([^:]*+A+Comp\):\([^;]*;\)\(.*\)/@P.Stem.Nom@\1:@P.Stem.Nom@\2\3/' \
| sed '/^parem+/s/^\([^:]*+A+Comp\):\([^;]*;\)\(.*\)/@P.Stem.Nom@\1:@P.Stem.Nom@\2\3/' \
| sed '/^pahem+/s/^\([^:]*+A+Comp\):\([^;]*;\)\(.*\)/@P.Stem.Nom@\1:@P.Stem.Nom@\2\3/' \
| sed '/^vasem+/s/^\([^:]*+A+Comp\):\([^;]*;\)\(.*\)/@P.Stem.Nom@\1:@P.Stem.Nom@\2\3/' \
| sed '/^suurem+/s/^\([^:]*+A+Comp\):\([^;]*;\)\(.*\)/@P.Stem.Nom@\1:@P.Stem.Nom@\2\3/' \
| sed '/^vanem+/s/^\([^:]*+A+Comp\):\([^;]*;\)\(.*\)/@P.Stem.Nom@\1:@P.Stem.Nom@\2\3/' \
| sed '/^vähem+/s/^\([^:]*+A+Comp\):\([^;]*;\)\(.*\)/@P.Stem.Nom@\1:@P.Stem.Nom@\2\3/' \
Expand Down Expand Up @@ -559,6 +560,14 @@ cat fs_gt.inflecting.tmp1.tagged | grep '\(+N:\)\|\(+N+Usage\)' \
| sed '/^killustik+/s/^\([^:]*+N\):\([^;]*;\)\(.*\)/@P.Stem.Nom@\1:@P.Stem.Nom@\2\3/' \
| sed '/^kapsas+/s/^\([^:]*+N\):\([^;]*;\)\(.*\)/@P.Stem.Nom@\1:@P.Stem.Nom@\2\3/' \
| sed '/^nälg+/s/^\([^:]*+N\):\([^;]*;\)\(.*\)/@P.Stem.Nom@\1:@P.Stem.Nom@\2\3/' \
| sed '/^liist+/s/^\([^:]*+N\):\([^;]*;\)\(.*\)/@P.Stem.Nom@\1:@P.Stem.Nom@\2\3/' \
| sed '/^matš+/s/^\([^:]*+N\):\([^;]*;\)\(.*\)/@P.Stem.Nom@\1:@P.Stem.Nom@\2\3/' \
| sed '/^prill+/s/^\([^:]*+N\):\([^;]*;\)\(.*\)/@P.Stem.Nom@\1:@P.Stem.Nom@\2\3/' \
| sed '/^suum+/s/^\([^:]*+N\):\([^;]*;\)\(.*\)/@P.Stem.Nom@\1:@P.Stem.Nom@\2\3/' \
| sed '/^süsi+/s/^\([^:]*+N\):\([^;]*;\)\(.*\)/@P.Stem.Nom@\1:@P.Stem.Nom@\2\3/' \
| sed '/^termostaat+/s/^\([^:]*+N\):\([^;]*;\)\(.*\)/@P.Stem.Nom@\1:@P.Stem.Nom@\2\3/' \
| sed '/^tsitrus+/s/^\([^:]*+N\):\([^;]*;\)\(.*\)/@P.Stem.Nom@\1:@P.Stem.Nom@\2\3/' \
| sed '/^tuhar+/s/^\([^:]*+N\):\([^;]*;\)\(.*\)/@P.Stem.Nom@\1:@P.Stem.Nom@\2\3/' \
| sed '/^kameeleon+/s/^\([^:]*+N\):\([^;]*;\)\(.*\)/@P.Stem.Nom@\1:@P.Stem.Nom@\2\3/' \
| sed '/^sari+N.*PIIM/s/^\([^:]*+N\):\([^;]*;\)\(.*\)/@P.Stem.Nom@\1:@P.Stem.Nom@\2\3/' \
\
Expand All @@ -574,11 +583,6 @@ cat nouns.proto1 \
| sed -f nomstem_first_eit.sed \
\
| sed '/;.*mnocompound/s/^\([^:]*+N[^:]*\):\([^;]*;\)\(.*\)mnocompound/@P.Stem.Single@\1:@P.Stem.Single@\2\3/' \
| sed '/^vana+/s/^\([^:]*+N\):\([^;]*;\)\(.*\)/@P.Stem.Final@\1:@P.Stem.Final@\2\3/' \
| sed '/^alam+/s/^\([^:]*+N\):\([^;]*;\)\(.*\)/@P.Stem.Final@\1:@P.Stem.Final@\2\3/' \
| sed '/@alam+/s/^\([^:]*+N\):\([^;]*;\)\(.*\)/@P.Stem.Final@\1:@P.Stem.Final@\2\3/' \
| sed '/^ülem+/s/^\([^:]*+N\):\([^;]*;\)\(.*\)/@P.Stem.Final@\1:@P.Stem.Final@\2\3/' \
| sed '/@ülem+/s/^\([^:]*+N\):\([^;]*;\)\(.*\)/@P.Stem.Final@\1:@P.Stem.Final@\2\3/' \
\
| sed '/^pee+/s/^\([^:]*+N\):\([^;]*;\)\(.*\)/@P.Stem.Single@\1:@P.Stem.Single@\2\3/' \
| sed '/^aar+/s/^\([^:]*+N\):\([^;]*;\)\(.*\)/@P.Stem.Single@\1:@P.Stem.Single@\2\3/' \
Expand Down

0 comments on commit ba18a4d

Please sign in to comment.