Document (#39920)

Author
Cruys, T. van de
Moirón, B.V.
Title
Semantics-based multiword expression extraction
Source
Proceedings of the Workshop on A Broader Perspective on Multiword Expressions, Prag 2007
Imprint
Prag : Association for Computational Linguistics
Year
2007
Pages
S.25-32
Abstract
This paper describes a fully unsupervised and automated method for large-scale extraction of multiword expressions (MWEs) from large corpora. The method aims at capturing the non-compositionality of MWEs; the intuition is that a noun within a MWE cannot easily be replaced by a semantically similar noun. To implement this intuition, a noun clustering is automatically extracted (using distributional similarity measures), which gives us clusters of semantically related nouns. Next, a number of statistical measures - based on selectional preferences - are developed that formalize the intuition of non-compositionality. Our approach has been tested on Dutch, and automatically evaluated using Dutch lexical resources.
Theme
Computerlinguistik

Similar documents (content)

  1. Nagy T., I.: Detecting multiword expressions and named entities in natural language texts (2014) 0.38
    0.38224712 = sum of:
      0.38224712 = product of:
        0.9556178 = sum of:
          0.030996917 = weight(abstract_txt:lexical in 2536) [ClassicSimilarity], result of:
            0.030996917 = score(doc=2536,freq=2.0), product of:
              0.08591419 = queryWeight, product of:
                1.0017332 = boost
                6.5309834 = idf(docFreq=175, maxDocs=44421)
                0.013132103 = queryNorm
              0.36078927 = fieldWeight in 2536, product of:
                1.4142135 = tf(freq=2.0), with freq of:
                  2.0 = termFreq=2.0
                6.5309834 = idf(docFreq=175, maxDocs=44421)
                0.0390625 = fieldNorm(doc=2536)
          0.022389844 = weight(abstract_txt:expression in 2536) [ClassicSimilarity], result of:
            0.022389844 = score(doc=2536,freq=1.0), product of:
              0.08714248 = queryWeight, product of:
                1.0088685 = boost
                6.5775037 = idf(docFreq=167, maxDocs=44421)
                0.013132103 = queryNorm
              0.25693375 = fieldWeight in 2536, product of:
                1.0 = tf(freq=1.0), with freq of:
                  1.0 = termFreq=1.0
                6.5775037 = idf(docFreq=167, maxDocs=44421)
                0.0390625 = fieldNorm(doc=2536)
          0.069404475 = weight(abstract_txt:expressions in 2536) [ClassicSimilarity], result of:
            0.069404475 = score(doc=2536,freq=8.0), product of:
              0.09263126 = queryWeight, product of:
                1.0401558 = boost
                6.7814865 = idf(docFreq=136, maxDocs=44421)
                0.013132103 = queryNorm
              0.7492555 = fieldWeight in 2536, product of:
                2.828427 = tf(freq=8.0), with freq of:
                  8.0 = termFreq=8.0
                6.7814865 = idf(docFreq=136, maxDocs=44421)
                0.0390625 = fieldNorm(doc=2536)
          0.006500507 = weight(abstract_txt:using in 2536) [ClassicSimilarity], result of:
            0.006500507 = score(doc=2536,freq=1.0), product of:
              0.048139773 = queryWeight, product of:
                1.0604413 = boost
                3.4568708 = idf(docFreq=3806, maxDocs=44421)
                0.013132103 = queryNorm
              0.13503401 = fieldWeight in 2536, product of:
                1.0 = tf(freq=1.0), with freq of:
                  1.0 = termFreq=1.0
                3.4568708 = idf(docFreq=3806, maxDocs=44421)
                0.0390625 = fieldNorm(doc=2536)
          0.03203869 = weight(abstract_txt:method in 2536) [ClassicSimilarity], result of:
            0.03203869 = score(doc=2536,freq=5.0), product of:
              0.0815328 = queryWeight, product of:
                1.3800689 = boost
                4.4988065 = idf(docFreq=1342, maxDocs=44421)
                0.013132103 = queryNorm
              0.3929546 = fieldWeight in 2536, product of:
                2.236068 = tf(freq=5.0), with freq of:
                  5.0 = termFreq=5.0
                4.4988065 = idf(docFreq=1342, maxDocs=44421)
                0.0390625 = fieldNorm(doc=2536)
          0.0374604 = weight(abstract_txt:automatically in 2536) [ClassicSimilarity], result of:
            0.0374604 = score(doc=2536,freq=2.0), product of:
              0.12281296 = queryWeight, product of:
                1.6937788 = boost
                5.521451 = idf(docFreq=482, maxDocs=44421)
                0.013132103 = queryNorm
              0.30501994 = fieldWeight in 2536, product of:
                1.4142135 = tf(freq=2.0), with freq of:
                  2.0 = termFreq=2.0
                5.521451 = idf(docFreq=482, maxDocs=44421)
                0.0390625 = fieldNorm(doc=2536)
          0.06470948 = weight(abstract_txt:extraction in 2536) [ClassicSimilarity], result of:
            0.06470948 = score(doc=2536,freq=3.0), product of:
              0.15445812 = queryWeight, product of:
                1.8995029 = boost
                6.192079 = idf(docFreq=246, maxDocs=44421)
                0.013132103 = queryNorm
              0.41894513 = fieldWeight in 2536, product of:
                1.7320508 = tf(freq=3.0), with freq of:
                  3.0 = termFreq=3.0
                6.192079 = idf(docFreq=246, maxDocs=44421)
                0.0390625 = fieldNorm(doc=2536)
          0.33860096 = weight(abstract_txt:multiword in 2536) [ClassicSimilarity], result of:
            0.33860096 = score(doc=2536,freq=11.0), product of:
              0.30190277 = queryWeight, product of:
                2.6556334 = boost
                8.656945 = idf(docFreq=20, maxDocs=44421)
                0.013132103 = queryNorm
              1.1215563 = fieldWeight in 2536, product of:
                3.3166249 = tf(freq=11.0), with freq of:
                  11.0 = termFreq=11.0
                8.656945 = idf(docFreq=20, maxDocs=44421)
                0.0390625 = fieldNorm(doc=2536)
          0.24280521 = weight(abstract_txt:mwes in 2536) [ClassicSimilarity], result of:
            0.24280521 = score(doc=2536,freq=3.0), product of:
              0.37296733 = queryWeight, product of:
                2.951685 = boost
                9.622026 = idf(docFreq=7, maxDocs=44421)
                0.013132103 = queryNorm
              0.6510093 = fieldWeight in 2536, product of:
                1.7320508 = tf(freq=3.0), with freq of:
                  3.0 = termFreq=3.0
                9.622026 = idf(docFreq=7, maxDocs=44421)
                0.0390625 = fieldNorm(doc=2536)
          0.11071132 = weight(abstract_txt:noun in 2536) [ClassicSimilarity], result of:
            0.11071132 = score(doc=2536,freq=1.0), product of:
              0.36477998 = queryWeight, product of:
                3.575162 = boost
                7.769642 = idf(docFreq=50, maxDocs=44421)
                0.013132103 = queryNorm
              0.30350164 = fieldWeight in 2536, product of:
                1.0 = tf(freq=1.0), with freq of:
                  1.0 = termFreq=1.0
                7.769642 = idf(docFreq=50, maxDocs=44421)
                0.0390625 = fieldNorm(doc=2536)
        0.4 = coord(10/25)
    
  2. Snajder, J.; Almic, P.: Modeling semantic compositionality of Croatian multiword expressions (2015) 0.35
    0.34894344 = sum of:
      0.34894344 = product of:
        1.7447171 = sum of:
          0.058891654 = weight(abstract_txt:expressions in 3920) [ClassicSimilarity], result of:
            0.058891654 = score(doc=3920,freq=1.0), product of:
              0.09263126 = queryWeight, product of:
                1.0401558 = boost
                6.7814865 = idf(docFreq=136, maxDocs=44421)
                0.013132103 = queryNorm
              0.63576436 = fieldWeight in 3920, product of:
                1.0 = tf(freq=1.0), with freq of:
                  1.0 = termFreq=1.0
                6.7814865 = idf(docFreq=136, maxDocs=44421)
                0.09375 = fieldNorm(doc=3920)
          0.15678616 = weight(abstract_txt:distributional in 3920) [ClassicSimilarity], result of:
            0.15678616 = score(doc=3920,freq=1.0), product of:
              0.17793453 = queryWeight, product of:
                1.4416164 = boost
                9.398883 = idf(docFreq=9, maxDocs=44421)
                0.013132103 = queryNorm
              0.88114524 = fieldWeight in 3920, product of:
                1.0 = tf(freq=1.0), with freq of:
                  1.0 = termFreq=1.0
                9.398883 = idf(docFreq=9, maxDocs=44421)
                0.09375 = fieldNorm(doc=3920)
          0.24502087 = weight(abstract_txt:multiword in 3920) [ClassicSimilarity], result of:
            0.24502087 = score(doc=3920,freq=1.0), product of:
              0.30190277 = queryWeight, product of:
                2.6556334 = boost
                8.656945 = idf(docFreq=20, maxDocs=44421)
                0.013132103 = queryNorm
              0.81158864 = fieldWeight in 3920, product of:
                1.0 = tf(freq=1.0), with freq of:
                  1.0 = termFreq=1.0
                8.656945 = idf(docFreq=20, maxDocs=44421)
                0.09375 = fieldNorm(doc=3920)
          0.58273244 = weight(abstract_txt:mwes in 3920) [ClassicSimilarity], result of:
            0.58273244 = score(doc=3920,freq=3.0), product of:
              0.37296733 = queryWeight, product of:
                2.951685 = boost
                9.622026 = idf(docFreq=7, maxDocs=44421)
                0.013132103 = queryNorm
              1.5624223 = fieldWeight in 3920, product of:
                1.7320508 = tf(freq=3.0), with freq of:
                  3.0 = termFreq=3.0
                9.622026 = idf(docFreq=7, maxDocs=44421)
                0.09375 = fieldNorm(doc=3920)
          0.7012861 = weight(abstract_txt:compositionality in 3920) [ClassicSimilarity], result of:
            0.7012861 = score(doc=3920,freq=4.0), product of:
              0.38339096 = queryWeight, product of:
                2.9926472 = boost
                9.755557 = idf(docFreq=6, maxDocs=44421)
                0.013132103 = queryNorm
              1.8291669 = fieldWeight in 3920, product of:
                2.0 = tf(freq=4.0), with freq of:
                  4.0 = termFreq=4.0
                9.755557 = idf(docFreq=6, maxDocs=44421)
                0.09375 = fieldNorm(doc=3920)
        0.2 = coord(5/25)
    
  3. Nissim, M.; Zaninello, A.: Modeling the internal variability of multiword expressions through a pattern-based method (2013) 0.27
    0.269063 = sum of:
      0.269063 = product of:
        0.84082186 = sum of:
          0.039261103 = weight(abstract_txt:expressions in 1990) [ClassicSimilarity], result of:
            0.039261103 = score(doc=1990,freq=1.0), product of:
              0.09263126 = queryWeight, product of:
                1.0401558 = boost
                6.7814865 = idf(docFreq=136, maxDocs=44421)
                0.013132103 = queryNorm
              0.4238429 = fieldWeight in 1990, product of:
                1.0 = tf(freq=1.0), with freq of:
                  1.0 = termFreq=1.0
                6.7814865 = idf(docFreq=136, maxDocs=44421)
                0.0625 = fieldNorm(doc=1990)
          0.010400811 = weight(abstract_txt:using in 1990) [ClassicSimilarity], result of:
            0.010400811 = score(doc=1990,freq=1.0), product of:
              0.048139773 = queryWeight, product of:
                1.0604413 = boost
                3.4568708 = idf(docFreq=3806, maxDocs=44421)
                0.013132103 = queryNorm
              0.21605442 = fieldWeight in 1990, product of:
                1.0 = tf(freq=1.0), with freq of:
                  1.0 = termFreq=1.0
                3.4568708 = idf(docFreq=3806, maxDocs=44421)
                0.0625 = fieldNorm(doc=1990)
          0.022072751 = weight(abstract_txt:large in 1990) [ClassicSimilarity], result of:
            0.022072751 = score(doc=1990,freq=1.0), product of:
              0.07949934 = queryWeight, product of:
                1.3627505 = boost
                4.4423513 = idf(docFreq=1420, maxDocs=44421)
                0.013132103 = queryNorm
              0.27764696 = fieldWeight in 1990, product of:
                1.0 = tf(freq=1.0), with freq of:
                  1.0 = termFreq=1.0
                4.4423513 = idf(docFreq=1420, maxDocs=44421)
                0.0625 = fieldNorm(doc=1990)
          0.03242087 = weight(abstract_txt:method in 1990) [ClassicSimilarity], result of:
            0.03242087 = score(doc=1990,freq=2.0), product of:
              0.0815328 = queryWeight, product of:
                1.3800689 = boost
                4.4988065 = idf(docFreq=1342, maxDocs=44421)
                0.013132103 = queryNorm
              0.39764208 = fieldWeight in 1990, product of:
                1.4142135 = tf(freq=2.0), with freq of:
                  2.0 = termFreq=2.0
                4.4988065 = idf(docFreq=1342, maxDocs=44421)
                0.0625 = fieldNorm(doc=1990)
          0.040195253 = weight(abstract_txt:measures in 1990) [ClassicSimilarity], result of:
            0.040195253 = score(doc=1990,freq=1.0), product of:
              0.11855205 = queryWeight, product of:
                1.6641372 = boost
                5.424824 = idf(docFreq=531, maxDocs=44421)
                0.013132103 = queryNorm
              0.3390515 = fieldWeight in 1990, product of:
                1.0 = tf(freq=1.0), with freq of:
                  1.0 = termFreq=1.0
                5.424824 = idf(docFreq=531, maxDocs=44421)
                0.0625 = fieldNorm(doc=1990)
          0.084536105 = weight(abstract_txt:extraction in 1990) [ClassicSimilarity], result of:
            0.084536105 = score(doc=1990,freq=2.0), product of:
              0.15445812 = queryWeight, product of:
                1.8995029 = boost
                6.192079 = idf(docFreq=246, maxDocs=44421)
                0.013132103 = queryNorm
              0.5473076 = fieldWeight in 1990, product of:
                1.4142135 = tf(freq=2.0), with freq of:
                  2.0 = termFreq=2.0
                6.192079 = idf(docFreq=246, maxDocs=44421)
                0.0625 = fieldNorm(doc=1990)
          0.16334723 = weight(abstract_txt:multiword in 1990) [ClassicSimilarity], result of:
            0.16334723 = score(doc=1990,freq=1.0), product of:
              0.30190277 = queryWeight, product of:
                2.6556334 = boost
                8.656945 = idf(docFreq=20, maxDocs=44421)
                0.013132103 = queryNorm
              0.5410591 = fieldWeight in 1990, product of:
                1.0 = tf(freq=1.0), with freq of:
                  1.0 = termFreq=1.0
                8.656945 = idf(docFreq=20, maxDocs=44421)
                0.0625 = fieldNorm(doc=1990)
          0.4485877 = weight(abstract_txt:mwes in 1990) [ClassicSimilarity], result of:
            0.4485877 = score(doc=1990,freq=4.0), product of:
              0.37296733 = queryWeight, product of:
                2.951685 = boost
                9.622026 = idf(docFreq=7, maxDocs=44421)
                0.013132103 = queryNorm
              1.2027533 = fieldWeight in 1990, product of:
                2.0 = tf(freq=4.0), with freq of:
                  4.0 = termFreq=4.0
                9.622026 = idf(docFreq=7, maxDocs=44421)
                0.0625 = fieldNorm(doc=1990)
        0.32 = coord(8/25)
    
  4. Vechtomova, O.: ¬A method for automatic extraction of multiword units representing business aspects from user reviews (2014) 0.17
    0.17424735 = sum of:
      0.17424735 = product of:
        0.72603065 = sum of:
          0.013001014 = weight(abstract_txt:using in 2304) [ClassicSimilarity], result of:
            0.013001014 = score(doc=2304,freq=1.0), product of:
              0.048139773 = queryWeight, product of:
                1.0604413 = boost
                3.4568708 = idf(docFreq=3806, maxDocs=44421)
                0.013132103 = queryNorm
              0.27006802 = fieldWeight in 2304, product of:
                1.0 = tf(freq=1.0), with freq of:
                  1.0 = termFreq=1.0
                3.4568708 = idf(docFreq=3806, maxDocs=44421)
                0.078125 = fieldNorm(doc=2304)
          0.04963412 = weight(abstract_txt:method in 2304) [ClassicSimilarity], result of:
            0.04963412 = score(doc=2304,freq=3.0), product of:
              0.0815328 = queryWeight, product of:
                1.3800689 = boost
                4.4988065 = idf(docFreq=1342, maxDocs=44421)
                0.013132103 = queryNorm
              0.6087626 = fieldWeight in 2304, product of:
                1.7320508 = tf(freq=3.0), with freq of:
                  3.0 = termFreq=3.0
                4.4988065 = idf(docFreq=1342, maxDocs=44421)
                0.078125 = fieldNorm(doc=2304)
          0.18477426 = weight(abstract_txt:distributional in 2304) [ClassicSimilarity], result of:
            0.18477426 = score(doc=2304,freq=2.0), product of:
              0.17793453 = queryWeight, product of:
                1.4416164 = boost
                9.398883 = idf(docFreq=9, maxDocs=44421)
                0.013132103 = queryNorm
              1.0384396 = fieldWeight in 2304, product of:
                1.4142135 = tf(freq=2.0), with freq of:
                  2.0 = termFreq=2.0
                9.398883 = idf(docFreq=9, maxDocs=44421)
                0.078125 = fieldNorm(doc=2304)
          0.050244063 = weight(abstract_txt:measures in 2304) [ClassicSimilarity], result of:
            0.050244063 = score(doc=2304,freq=1.0), product of:
              0.11855205 = queryWeight, product of:
                1.6641372 = boost
                5.424824 = idf(docFreq=531, maxDocs=44421)
                0.013132103 = queryNorm
              0.4238144 = fieldWeight in 2304, product of:
                1.0 = tf(freq=1.0), with freq of:
                  1.0 = termFreq=1.0
                5.424824 = idf(docFreq=531, maxDocs=44421)
                0.078125 = fieldNorm(doc=2304)
          0.07472007 = weight(abstract_txt:extraction in 2304) [ClassicSimilarity], result of:
            0.07472007 = score(doc=2304,freq=1.0), product of:
              0.15445812 = queryWeight, product of:
                1.8995029 = boost
                6.192079 = idf(docFreq=246, maxDocs=44421)
                0.013132103 = queryNorm
              0.48375618 = fieldWeight in 2304, product of:
                1.0 = tf(freq=1.0), with freq of:
                  1.0 = termFreq=1.0
                6.192079 = idf(docFreq=246, maxDocs=44421)
                0.078125 = fieldNorm(doc=2304)
          0.35365713 = weight(abstract_txt:multiword in 2304) [ClassicSimilarity], result of:
            0.35365713 = score(doc=2304,freq=3.0), product of:
              0.30190277 = queryWeight, product of:
                2.6556334 = boost
                8.656945 = idf(docFreq=20, maxDocs=44421)
                0.013132103 = queryNorm
              1.1714272 = fieldWeight in 2304, product of:
                1.7320508 = tf(freq=3.0), with freq of:
                  3.0 = termFreq=3.0
                8.656945 = idf(docFreq=20, maxDocs=44421)
                0.078125 = fieldNorm(doc=2304)
        0.24 = coord(6/25)
    
  5. Ramisch, C.; Schreiner, P.; Idiart, M.; Villavicencio, A.: ¬An evaluation of methods for the extraction of multiword expressions (20xx) 0.16
    0.15680084 = sum of:
      0.15680084 = product of:
        0.98000526 = sum of:
          0.06870693 = weight(abstract_txt:expressions in 1962) [ClassicSimilarity], result of:
            0.06870693 = score(doc=1962,freq=1.0), product of:
              0.09263126 = queryWeight, product of:
                1.0401558 = boost
                6.7814865 = idf(docFreq=136, maxDocs=44421)
                0.013132103 = queryNorm
              0.7417251 = fieldWeight in 1962, product of:
                1.0 = tf(freq=1.0), with freq of:
                  1.0 = termFreq=1.0
                6.7814865 = idf(docFreq=136, maxDocs=44421)
                0.109375 = fieldNorm(doc=1962)
          0.07034169 = weight(abstract_txt:measures in 1962) [ClassicSimilarity], result of:
            0.07034169 = score(doc=1962,freq=1.0), product of:
              0.11855205 = queryWeight, product of:
                1.6641372 = boost
                5.424824 = idf(docFreq=531, maxDocs=44421)
                0.013132103 = queryNorm
              0.59334016 = fieldWeight in 1962, product of:
                1.0 = tf(freq=1.0), with freq of:
                  1.0 = termFreq=1.0
                5.424824 = idf(docFreq=531, maxDocs=44421)
                0.109375 = fieldNorm(doc=1962)
          0.28585768 = weight(abstract_txt:multiword in 1962) [ClassicSimilarity], result of:
            0.28585768 = score(doc=1962,freq=1.0), product of:
              0.30190277 = queryWeight, product of:
                2.6556334 = boost
                8.656945 = idf(docFreq=20, maxDocs=44421)
                0.013132103 = queryNorm
              0.9468534 = fieldWeight in 1962, product of:
                1.0 = tf(freq=1.0), with freq of:
                  1.0 = termFreq=1.0
                8.656945 = idf(docFreq=20, maxDocs=44421)
                0.109375 = fieldNorm(doc=1962)
          0.55509895 = weight(abstract_txt:mwes in 1962) [ClassicSimilarity], result of:
            0.55509895 = score(doc=1962,freq=2.0), product of:
              0.37296733 = queryWeight, product of:
                2.951685 = boost
                9.622026 = idf(docFreq=7, maxDocs=44421)
                0.013132103 = queryNorm
              1.4883313 = fieldWeight in 1962, product of:
                1.4142135 = tf(freq=2.0), with freq of:
                  2.0 = termFreq=2.0
                9.622026 = idf(docFreq=7, maxDocs=44421)
                0.109375 = fieldNorm(doc=1962)
        0.16 = coord(4/25)