| 1 | #!/usr/bin/env python2
 | 
| 2 | """spelling.py.
 | 
| 3 | 
 | 
| 4 | Filter the output of 'lynx -dump' into a list of words to spell check.
 | 
| 5 | """
 | 
| 6 | from __future__ import print_function
 | 
| 7 | 
 | 
| 8 | from collections import Counter
 | 
| 9 | import optparse
 | 
| 10 | import re
 | 
| 11 | import sys
 | 
| 12 | 
 | 
| 13 | from doctools.util import log
 | 
| 14 | 
 | 
| 15 | 
 | 
def SplitWords(contents):
    """Yield the words found in a blob of text, in order of appearance.

    URLs with an http, https or file scheme are deleted first so that their
    path components are not reported as words.  A word is a run of ASCII
    letters, optionally followed by a 't contraction suffix, e.g.

    - doesn't
    - can't

    Args:
      contents: the text to scan, as a single string.

    Yields:
      Each matched word as a string.
    """
    url_pat = re.compile(r'(http|https|file)://\S+')
    word_pat = re.compile(
        r'''
  [a-zA-Z]+
  (?:\'t\b)?  # optional contraction
  ''', re.VERBOSE)

    without_urls = url_pat.sub('', contents)

    # The pattern has no capturing groups, so group(0) is the whole word.
    for match in word_pat.finditer(without_urls):
        yield match.group(0)
 | 
| 35 | 
 | 
| 36 | 
 | 
def WordList(f):
    """Yield each line of f with leading and trailing whitespace removed.

    Args:
      f: an iterable of lines, e.g. an open file with one word per line.

    Yields:
      The stripped text of every line, in order.  Blank lines are yielded as
      empty strings.
    """
    for raw_line in f:
        entry = raw_line.strip()
        yield entry
 | 
| 41 | 
 | 
| 42 | 
 | 
def Options():
    """Build the command-line option parser.

    Returns:
      An optparse.OptionParser that accepts --known-words (a path, stored in
      opts.known_words) and --more-than-bash (an int defaulting to 0, stored
      in opts.more_than_bash).
    """
    parser = optparse.OptionParser()

    # (flag, keyword arguments for add_option), registered in this order.
    option_specs = [
        ('--known-words', {
            'dest': 'known_words',
            'help': 'List of words like /usr/share/dict/words',
        }),
        ('--more-than-bash', {
            'dest': 'more_than_bash',
            'type': int,
            'default': 0,
            'help':
            'Expected number of cases where OSH starts more processes than bash',
        }),
    ]
    for flag, kwargs in option_specs:
        parser.add_option(flag, **kwargs)

    return parser
 | 
| 57 | 
 | 
| 58 | 
 | 
def main(argv):
    """Run the spelling action selected on the command line.

    Actions:
      word-split: read text from stdin and print one word per line.
      check FILE...: read word files (one word per line), print the 20 most
        and 20 least common words, then for each file print the words that
        are not in the --known-words list and that occur exactly once across
        all the given files.

    Args:
      argv: the full argument vector, with the program name at argv[0].

    Raises:
      RuntimeError: if no action is given, if the action is not recognized,
        or if 'check' is run without --known-words.
    """
    o = Options()
    opts, argv = o.parse_args(argv[1:])

    # Report a missing action as a RuntimeError, like an invalid action,
    # rather than letting argv[0] raise IndexError.
    if not argv:
        raise RuntimeError('Action required (word-split or check)')
    action = argv[0]

    if action == 'word-split':
        contents = sys.stdin.read()
        for w in SplitWords(contents):
            print(w)

    elif action == 'check':
        # Fail before producing any output instead of hitting open(None).
        if opts.known_words is None:
            raise RuntimeError("'check' requires --known-words")

        word_files = argv[1:]

        # Occurrences of each word across all word files.
        d = Counter()
        for path in word_files:
            with open(path) as f:
                d.update(WordList(f))

        # Sort once; both reports below are slices of the same ranking.
        ranked = d.most_common()

        print('')
        print('Most common words')
        print('')
        for word, count in ranked[:20]:
            print('%10d %s' % (count, word))

        print('')
        print('Least common words')
        print('')
        for word, count in ranked[-20:]:
            print('%10d %s' % (count, word))

        log('%d word files', len(word_files))
        log('%d unique words', len(d))

        with open(opts.known_words) as f:
            known_words = set(WordList(f))

        print('')
        print('Potential Misspellings')
        print('')

        for path in word_files:

            print()
            print('\t%s' % path)
            print()

            with open(path) as f:
                # Known-word entries are stored as written, so accept an
                # exact-case match as well as a lowercased one.  Otherwise a
                # capitalized entry in the list (e.g. a proper noun) could
                # never match.
                unknown = {
                    w for w in WordList(f)
                    if w not in known_words and w.lower() not in known_words
                }

            if unknown:
                for u in sorted(unknown):
                    # only occurs once
                    if d.get(u) == 1:
                        print(u)
                log('\t%d unknown words in %s', len(unknown), path)

        # Checking algorithms:
        #
        # - Does it appear in the dictionary?  Problem: most computer terms
        # - Does it appear only once or twice in the whole corpus?
        # - Is the edit distance very close to a dictionary word?
        #   - e.g. substitutions is a typo

    else:
        raise RuntimeError('Invalid action %r' % action)
 | 
| 134 | 
 | 
| 135 | 
 | 
if __name__ == '__main__':
    # A RuntimeError raised by main() is reported as a one-line message on
    # stderr with exit status 1; other exceptions propagate with a traceback.
    try:
        main(sys.argv)
    except RuntimeError as err:
        print('FATAL: %s' % err, file=sys.stderr)
        raise SystemExit(1)
 |