Document (#7227)

Author
Taghva, K.
Title
¬The effects of noisy data on text retrieval
Source
Journal of the American Society for Information Science. 45(1994) no.1, S.50-58
Year
1994
Abstract
Reports the results of experiments on query evaluation in the presence of noisy data. In particular, an OCR-generated database and its corresponding 99.8% correct version are used to process a set of queries to determine the effect the degraded version will have on retrieval. With the set of scientific documents used in the testing, the effect is insignificant. Improves the result by applying an automatic postprocessing system designed to correct the kinds of errors generated by recognition devices.
Theme
Retrievalstudien

Similar documents (content)

  1. Taghva, K.; Borsack, J.; Condit, A.: Effects of OCR errors on ranking and feedback using the vector space model (1996) 0.23
    0.22769737 = sum of:
      0.22769737 = product of:
        0.8132049 = sum of:
          0.08775372 = weight(abstract_txt:recognition in 5019) [ClassicSimilarity], result of:
            0.08775372 = score(doc=5019,freq=1.0), product of:
              0.13122195 = queryWeight, product of:
                1.1984911 = boost
                6.114219 = idf(docFreq=266, maxDocs=44421)
                0.017907323 = queryNorm
              0.6687427 = fieldWeight in 5019, product of:
                1.0 = tf(freq=1.0), with freq of:
                  1.0 = termFreq=1.0
                6.114219 = idf(docFreq=266, maxDocs=44421)
                0.109375 = fieldNorm(doc=5019)
          0.10248323 = weight(abstract_txt:presence in 5019) [ClassicSimilarity], result of:
            0.10248323 = score(doc=5019,freq=1.0), product of:
              0.14552289 = queryWeight, product of:
                1.2621101 = boost
                6.4387774 = idf(docFreq=192, maxDocs=44421)
                0.017907323 = queryNorm
              0.7042413 = fieldWeight in 5019, product of:
                1.0 = tf(freq=1.0), with freq of:
                  1.0 = termFreq=1.0
                6.4387774 = idf(docFreq=192, maxDocs=44421)
                0.109375 = fieldNorm(doc=5019)
          0.15244688 = weight(abstract_txt:errors in 5019) [ClassicSimilarity], result of:
            0.15244688 = score(doc=5019,freq=2.0), product of:
              0.15050994 = queryWeight, product of:
                1.2835541 = boost
                6.548176 = idf(docFreq=172, maxDocs=44421)
                0.017907323 = queryNorm
              1.0128692 = fieldWeight in 5019, product of:
                1.4142135 = tf(freq=2.0), with freq of:
                  2.0 = termFreq=2.0
                6.548176 = idf(docFreq=172, maxDocs=44421)
                0.109375 = fieldNorm(doc=5019)
          0.029053887 = weight(abstract_txt:used in 5019) [ClassicSimilarity], result of:
            0.029053887 = score(doc=5019,freq=1.0), product of:
              0.07912413 = queryWeight, product of:
                1.3161368 = boost
                3.3572001 = idf(docFreq=4205, maxDocs=44421)
                0.017907323 = queryNorm
              0.36719376 = fieldWeight in 5019, product of:
                1.0 = tf(freq=1.0), with freq of:
                  1.0 = termFreq=1.0
                3.3572001 = idf(docFreq=4205, maxDocs=44421)
                0.109375 = fieldNorm(doc=5019)
          0.116395354 = weight(abstract_txt:improves in 5019) [ClassicSimilarity], result of:
            0.116395354 = score(doc=5019,freq=1.0), product of:
              0.15841144 = queryWeight, product of:
                1.3168153 = boost
                6.717861 = idf(docFreq=145, maxDocs=44421)
                0.017907323 = queryNorm
              0.73476607 = fieldWeight in 5019, product of:
                1.0 = tf(freq=1.0), with freq of:
                  1.0 = termFreq=1.0
                6.717861 = idf(docFreq=145, maxDocs=44421)
                0.109375 = fieldNorm(doc=5019)
          0.032262594 = weight(abstract_txt:retrieval in 5019) [ClassicSimilarity], result of:
            0.032262594 = score(doc=5019,freq=1.0), product of:
              0.08484748 = queryWeight, product of:
                1.3629065 = boost
                3.4765 = idf(docFreq=3732, maxDocs=44421)
                0.017907323 = queryNorm
              0.3802422 = fieldWeight in 5019, product of:
                1.0 = tf(freq=1.0), with freq of:
                  1.0 = termFreq=1.0
                3.4765 = idf(docFreq=3732, maxDocs=44421)
                0.109375 = fieldNorm(doc=5019)
          0.29280922 = weight(abstract_txt:degraded in 5019) [ClassicSimilarity], result of:
            0.29280922 = score(doc=5019,freq=1.0), product of:
              0.29301238 = queryWeight, product of:
                1.7909133 = boost
                9.1365185 = idf(docFreq=12, maxDocs=44421)
                0.017907323 = queryNorm
              0.9993067 = fieldWeight in 5019, product of:
                1.0 = tf(freq=1.0), with freq of:
                  1.0 = termFreq=1.0
                9.1365185 = idf(docFreq=12, maxDocs=44421)
                0.109375 = fieldNorm(doc=5019)
        0.28 = coord(7/25)
    
  2. Li, D.; Tang, J.; Ding, Y.; Shuai, X.; Chambers, T.; Sun, G.; Luo, Z.; Zhang, J.: Topic-level opinion influence model (TOIM) : an investigation using Tencent microblogging (2015) 0.11
    0.10755949 = sum of:
      0.10755949 = product of:
        0.44816455 = sum of:
          0.03312073 = weight(abstract_txt:experiments in 3345) [ClassicSimilarity], result of:
            0.03312073 = score(doc=3345,freq=1.0), product of:
              0.09952253 = queryWeight, product of:
                1.0437399 = boost
                5.324741 = idf(docFreq=587, maxDocs=44421)
                0.017907323 = queryNorm
              0.3327963 = fieldWeight in 3345, product of:
                1.0 = tf(freq=1.0), with freq of:
                  1.0 = termFreq=1.0
                5.324741 = idf(docFreq=587, maxDocs=44421)
                0.0625 = fieldNorm(doc=3345)
          0.01620524 = weight(abstract_txt:data in 3345) [ClassicSimilarity], result of:
            0.01620524 = score(doc=3345,freq=1.0), product of:
              0.07785774 = queryWeight, product of:
                1.3055619 = boost
                3.3302255 = idf(docFreq=4320, maxDocs=44421)
                0.017907323 = queryNorm
              0.20813909 = fieldWeight in 3345, product of:
                1.0 = tf(freq=1.0), with freq of:
                  1.0 = termFreq=1.0
                3.3302255 = idf(docFreq=4320, maxDocs=44421)
                0.0625 = fieldNorm(doc=3345)
          0.016602222 = weight(abstract_txt:used in 3345) [ClassicSimilarity], result of:
            0.016602222 = score(doc=3345,freq=1.0), product of:
              0.07912413 = queryWeight, product of:
                1.3161368 = boost
                3.3572001 = idf(docFreq=4205, maxDocs=44421)
                0.017907323 = queryNorm
              0.20982501 = fieldWeight in 3345, product of:
                1.0 = tf(freq=1.0), with freq of:
                  1.0 = termFreq=1.0
                3.3572001 = idf(docFreq=4205, maxDocs=44421)
                0.0625 = fieldNorm(doc=3345)
          0.06651163 = weight(abstract_txt:improves in 3345) [ClassicSimilarity], result of:
            0.06651163 = score(doc=3345,freq=1.0), product of:
              0.15841144 = queryWeight, product of:
                1.3168153 = boost
                6.717861 = idf(docFreq=145, maxDocs=44421)
                0.017907323 = queryNorm
              0.41986632 = fieldWeight in 3345, product of:
                1.0 = tf(freq=1.0), with freq of:
                  1.0 = termFreq=1.0
                6.717861 = idf(docFreq=145, maxDocs=44421)
                0.0625 = fieldNorm(doc=3345)
          0.0733631 = weight(abstract_txt:generated in 3345) [ClassicSimilarity], result of:
            0.0733631 = score(doc=3345,freq=1.0), product of:
              0.2130672 = queryWeight, product of:
                2.159757 = boost
                5.509105 = idf(docFreq=488, maxDocs=44421)
                0.017907323 = queryNorm
              0.34431908 = fieldWeight in 3345, product of:
                1.0 = tf(freq=1.0), with freq of:
                  1.0 = termFreq=1.0
                5.509105 = idf(docFreq=488, maxDocs=44421)
                0.0625 = fieldNorm(doc=3345)
          0.24236163 = weight(abstract_txt:noisy in 3345) [ClassicSimilarity], result of:
            0.24236163 = score(doc=3345,freq=1.0), product of:
              0.47261488 = queryWeight, product of:
                3.2166238 = boost
                8.20496 = idf(docFreq=32, maxDocs=44421)
                0.017907323 = queryNorm
              0.51281 = fieldWeight in 3345, product of:
                1.0 = tf(freq=1.0), with freq of:
                  1.0 = termFreq=1.0
                8.20496 = idf(docFreq=32, maxDocs=44421)
                0.0625 = fieldNorm(doc=3345)
        0.24 = coord(6/25)
    
  3. Beall, J.; Kafadar, K.: Measuring typographical errors' impact on retrieval in bibliographic databases (2007) 0.11
    0.10523059 = sum of:
      0.10523059 = product of:
        0.52615297 = sum of:
          0.073202305 = weight(abstract_txt:presence in 386) [ClassicSimilarity], result of:
            0.073202305 = score(doc=386,freq=1.0), product of:
              0.14552289 = queryWeight, product of:
                1.2621101 = boost
                6.4387774 = idf(docFreq=192, maxDocs=44421)
                0.017907323 = queryNorm
              0.50302947 = fieldWeight in 386, product of:
                1.0 = tf(freq=1.0), with freq of:
                  1.0 = termFreq=1.0
                6.4387774 = idf(docFreq=192, maxDocs=44421)
                0.078125 = fieldNorm(doc=386)
          0.10889064 = weight(abstract_txt:errors in 386) [ClassicSimilarity], result of:
            0.10889064 = score(doc=386,freq=2.0), product of:
              0.15050994 = queryWeight, product of:
                1.2835541 = boost
                6.548176 = idf(docFreq=172, maxDocs=44421)
                0.017907323 = queryNorm
              0.7234781 = fieldWeight in 386, product of:
                1.4142135 = tf(freq=2.0), with freq of:
                  2.0 = termFreq=2.0
                6.548176 = idf(docFreq=172, maxDocs=44421)
                0.078125 = fieldNorm(doc=386)
          0.023044707 = weight(abstract_txt:retrieval in 386) [ClassicSimilarity], result of:
            0.023044707 = score(doc=386,freq=1.0), product of:
              0.08484748 = queryWeight, product of:
                1.3629065 = boost
                3.4765 = idf(docFreq=3732, maxDocs=44421)
                0.017907323 = queryNorm
              0.27160156 = fieldWeight in 386, product of:
                1.0 = tf(freq=1.0), with freq of:
                  1.0 = termFreq=1.0
                3.4765 = idf(docFreq=3732, maxDocs=44421)
                0.078125 = fieldNorm(doc=386)
          0.08728715 = weight(abstract_txt:effect in 386) [ClassicSimilarity], result of:
            0.08728715 = score(doc=386,freq=1.0), product of:
              0.20616978 = queryWeight, product of:
                2.1245115 = boost
                5.419201 = idf(docFreq=534, maxDocs=44421)
                0.017907323 = queryNorm
              0.42337507 = fieldWeight in 386, product of:
                1.0 = tf(freq=1.0), with freq of:
                  1.0 = termFreq=1.0
                5.419201 = idf(docFreq=534, maxDocs=44421)
                0.078125 = fieldNorm(doc=386)
          0.23372819 = weight(abstract_txt:correct in 386) [ClassicSimilarity], result of:
            0.23372819 = score(doc=386,freq=2.0), product of:
              0.31554082 = queryWeight, product of:
                2.6282964 = boost
                6.704255 = idf(docFreq=147, maxDocs=44421)
                0.017907323 = queryNorm
              0.7407225 = fieldWeight in 386, product of:
                1.4142135 = tf(freq=2.0), with freq of:
                  2.0 = termFreq=2.0
                6.704255 = idf(docFreq=147, maxDocs=44421)
                0.078125 = fieldNorm(doc=386)
        0.2 = coord(5/25)
    
  4. Taghva, K.; Borsack, J.; Condit, A.: Evaluation of model-based retrieval effectiveness with OCR text (1996) 0.10
    0.10171713 = sum of:
      0.10171713 = product of:
        0.50858563 = sum of:
          0.070259675 = weight(abstract_txt:experiments in 4553) [ClassicSimilarity], result of:
            0.070259675 = score(doc=4553,freq=2.0), product of:
              0.09952253 = queryWeight, product of:
                1.0437399 = boost
                5.324741 = idf(docFreq=587, maxDocs=44421)
                0.017907323 = queryNorm
              0.70596755 = fieldWeight in 4553, product of:
                1.4142135 = tf(freq=2.0), with freq of:
                  2.0 = termFreq=2.0
                5.324741 = idf(docFreq=587, maxDocs=44421)
                0.09375 = fieldNorm(doc=4553)
          0.06735611 = weight(abstract_txt:applying in 4553) [ClassicSimilarity], result of:
            0.06735611 = score(doc=4553,freq=1.0), product of:
              0.12191168 = queryWeight, product of:
                1.1551921 = boost
                5.8933253 = idf(docFreq=332, maxDocs=44421)
                0.017907323 = queryNorm
              0.55249923 = fieldWeight in 4553, product of:
                1.0 = tf(freq=1.0), with freq of:
                  1.0 = termFreq=1.0
                5.8933253 = idf(docFreq=332, maxDocs=44421)
                0.09375 = fieldNorm(doc=4553)
          0.1600359 = weight(abstract_txt:errors in 4553) [ClassicSimilarity], result of:
            0.1600359 = score(doc=4553,freq=3.0), product of:
              0.15050994 = queryWeight, product of:
                1.2835541 = boost
                6.548176 = idf(docFreq=172, maxDocs=44421)
                0.017907323 = queryNorm
              1.0632912 = fieldWeight in 4553, product of:
                1.7320508 = tf(freq=3.0), with freq of:
                  3.0 = termFreq=3.0
                6.548176 = idf(docFreq=172, maxDocs=44421)
                0.09375 = fieldNorm(doc=4553)
          0.055307303 = weight(abstract_txt:retrieval in 4553) [ClassicSimilarity], result of:
            0.055307303 = score(doc=4553,freq=4.0), product of:
              0.08484748 = queryWeight, product of:
                1.3629065 = boost
                3.4765 = idf(docFreq=3732, maxDocs=44421)
                0.017907323 = queryNorm
              0.6518438 = fieldWeight in 4553, product of:
                2.0 = tf(freq=4.0), with freq of:
                  4.0 = termFreq=4.0
                3.4765 = idf(docFreq=3732, maxDocs=44421)
                0.09375 = fieldNorm(doc=4553)
          0.15562664 = weight(abstract_txt:generated in 4553) [ClassicSimilarity], result of:
            0.15562664 = score(doc=4553,freq=2.0), product of:
              0.2130672 = queryWeight, product of:
                2.159757 = boost
                5.509105 = idf(docFreq=488, maxDocs=44421)
                0.017907323 = queryNorm
              0.73041105 = fieldWeight in 4553, product of:
                1.4142135 = tf(freq=2.0), with freq of:
                  2.0 = termFreq=2.0
                5.509105 = idf(docFreq=488, maxDocs=44421)
                0.09375 = fieldNorm(doc=4553)
        0.2 = coord(5/25)
    
  5. Tüür-Fröhlich, T.: ¬The non-trivial effects of trivial errors in scientific communication and evaluation (2016) 0.10
    0.100744456 = sum of:
      0.100744456 = product of:
        0.41976857 = sum of:
          0.14260101 = weight(abstract_txt:errors in 4137) [ClassicSimilarity], result of:
            0.14260101 = score(doc=4137,freq=7.0), product of:
              0.15050994 = queryWeight, product of:
                1.2835541 = boost
                6.548176 = idf(docFreq=172, maxDocs=44421)
                0.017907323 = queryNorm
              0.9474524 = fieldWeight in 4137, product of:
                2.6457512 = tf(freq=7.0), with freq of:
                  7.0 = termFreq=7.0
                6.548176 = idf(docFreq=172, maxDocs=44421)
                0.0546875 = fieldNorm(doc=4137)
          0.020052962 = weight(abstract_txt:data in 4137) [ClassicSimilarity], result of:
            0.020052962 = score(doc=4137,freq=2.0), product of:
              0.07785774 = queryWeight, product of:
                1.3055619 = boost
                3.3302255 = idf(docFreq=4320, maxDocs=44421)
                0.017907323 = queryNorm
              0.257559 = fieldWeight in 4137, product of:
                1.4142135 = tf(freq=2.0), with freq of:
                  2.0 = termFreq=2.0
                3.3302255 = idf(docFreq=4320, maxDocs=44421)
                0.0546875 = fieldNorm(doc=4137)
          0.016131297 = weight(abstract_txt:retrieval in 4137) [ClassicSimilarity], result of:
            0.016131297 = score(doc=4137,freq=1.0), product of:
              0.08484748 = queryWeight, product of:
                1.3629065 = boost
                3.4765 = idf(docFreq=3732, maxDocs=44421)
                0.017907323 = queryNorm
              0.1901211 = fieldWeight in 4137, product of:
                1.0 = tf(freq=1.0), with freq of:
                  1.0 = termFreq=1.0
                3.4765 = idf(docFreq=3732, maxDocs=44421)
                0.0546875 = fieldNorm(doc=4137)
          0.061101004 = weight(abstract_txt:effect in 4137) [ClassicSimilarity], result of:
            0.061101004 = score(doc=4137,freq=1.0), product of:
              0.20616978 = queryWeight, product of:
                2.1245115 = boost
                5.419201 = idf(docFreq=534, maxDocs=44421)
                0.017907323 = queryNorm
              0.29636255 = fieldWeight in 4137, product of:
                1.0 = tf(freq=1.0), with freq of:
                  1.0 = termFreq=1.0
                5.419201 = idf(docFreq=534, maxDocs=44421)
                0.0546875 = fieldNorm(doc=4137)
          0.06419271 = weight(abstract_txt:generated in 4137) [ClassicSimilarity], result of:
            0.06419271 = score(doc=4137,freq=1.0), product of:
              0.2130672 = queryWeight, product of:
                2.159757 = boost
                5.509105 = idf(docFreq=488, maxDocs=44421)
                0.017907323 = queryNorm
              0.3012792 = fieldWeight in 4137, product of:
                1.0 = tf(freq=1.0), with freq of:
                  1.0 = termFreq=1.0
                5.509105 = idf(docFreq=488, maxDocs=44421)
                0.0546875 = fieldNorm(doc=4137)
          0.11568956 = weight(abstract_txt:correct in 4137) [ClassicSimilarity], result of:
            0.11568956 = score(doc=4137,freq=1.0), product of:
              0.31554082 = queryWeight, product of:
                2.6282964 = boost
                6.704255 = idf(docFreq=147, maxDocs=44421)
                0.017907323 = queryNorm
              0.36663896 = fieldWeight in 4137, product of:
                1.0 = tf(freq=1.0), with freq of:
                  1.0 = termFreq=1.0
                6.704255 = idf(docFreq=147, maxDocs=44421)
                0.0546875 = fieldNorm(doc=4137)
        0.24 = coord(6/25)