"""Text wrapping and filling.
"""

# NOTE: the docstring has to be the first statement of the module to be
# picked up as __doc__; a __future__ import is still legal directly after it.
from __future__ import print_function  # for OPy compiler
 | 
| 4 | 
 | 
| 5 | # Copyright (C) 1999-2001 Gregory P. Ward.
 | 
| 6 | # Copyright (C) 2002, 2003 Python Software Foundation.
 | 
| 7 | # Written by Greg Ward <gward@python.net>
 | 
| 8 | 
 | 
| 9 | __revision__ = "$Id$"
 | 
| 10 | 
 | 
| 11 | import string, re
 | 
| 12 | 
 | 
# Bind _unicode to the real unicode type when the interpreter has one.
# On builds without Unicode support the name does not exist, so fall
# back to a dummy class that no text object is ever an instance of.
try:
    unicode
except NameError:
    class _unicode(object):
        pass
else:
    _unicode = unicode
 | 
| 20 | 
 | 
| 21 | # Do the right thing with boolean values for all known Python versions
 | 
| 22 | # (so this module can be copied to projects that don't depend on Python
 | 
| 23 | # 2.3, e.g. Optik and Docutils) by uncommenting the block of code below.
 | 
| 24 | #try:
 | 
| 25 | #    True, False
 | 
| 26 | #except NameError:
 | 
| 27 | #    (True, False) = (1, 0)
 | 
| 28 | 
 | 
| 29 | __all__ = ['TextWrapper', 'wrap', 'fill', 'dedent']
 | 
| 30 | 
 | 
# The set of whitespace characters is fixed to the six US-ASCII ones
# instead of being taken from string.whitespace.  Under ISO-8859-1
# locales string.whitespace can pick up 0xa0, the *non-breaking* space.
# Honouring it would 1) let textwrap break lines at a character whose
# whole purpose is to forbid a break, and 2) risk trouble with Unicode
# text, because 0xa0 lies outside range(128).
_whitespace = '\t\n\v\f\r '
 | 
| 40 | 
 | 
class TextWrapper:
    """
    Word-wrap or fill a single paragraph of text.

    Callers normally need only wrap() and fill().  The underscore-prefixed
    methods are the individual pipeline steps and exist so that subclasses
    can override one of them; replacing the wrapping algorithm wholesale
    most likely means overriding _wrap_chunks().

    The following instance attributes (all accepted as constructor
    keyword arguments) control the result:
      width (default: 70)
        upper bound on the length of an output line; can be exceeded
        only when break_long_words is false
      initial_indent (default: "")
        prefix of the first output line; it is counted as part of that
        line's width
      subsequent_indent (default: "")
        prefix of every output line after the first; likewise counted
        as part of the line's width
      expand_tabs (default: true)
        expand tabs to spaces before anything else happens.  A tab
        becomes 1 .. 8 spaces depending on its column.  When false a
        tab counts as one character.
      replace_whitespace (default: true)
        after tab expansion, turn every whitespace character into a
        space.  Beware: with expand_tabs false and replace_whitespace
        true each tab ends up as a single space!
      fix_sentence_endings (default: false)
        make sure sentence-ending punctuation is followed by two
        spaces.  Disabled by default because the detection is
        (unavoidably) imperfect.
      break_long_words (default: true)
        split words that are longer than 'width'.  When false such
        words are kept whole and their lines may exceed 'width'.
      break_on_hyphens (default: true)
        permit a line break right after the hyphen of a compound word
        in addition to breaks on whitespace.
      drop_whitespace (default: true)
        remove whitespace at the start and end of every output line.
    """

    # Translation table for byte strings: each whitespace char -> ' '.
    whitespace_trans = string.maketrans(_whitespace, ' ' * len(_whitespace))

    # Same mapping for unicode strings, keyed by code point.  Note that
    # 'uspace' and the loop variable 'x' stay behind as class attributes.
    uspace = ord(u' ')
    unicode_whitespace_trans = {}
    for x in map(ord, _whitespace):
        unicode_whitespace_trans[x] = uspace

    # Splits text into chunks that may be separated by a line break.
    # For instance
    #   "Hello there -- you goof-ball, use the -b option!"
    # is cut into
    #   Hello/ /there/ /--/ /you/ /goof-/ball,/ /use/ /the/ /-b/ /option!
    # (once the empty strings have been discarded).
    wordsep_re = re.compile(
        r'(\s+|'                                  # any whitespace
        r'[^\s\w]*\w+[^0-9\W]-(?=\w+[^0-9\W])|'   # hyphenated words
        r'(?<=[\w\!\"\'\&\.\,\?])-{2,}(?=\w))')   # em-dash

    # Simpler variant that splits on runs of whitespace only, so
    #   "Hello there -- you goof-ball, use the -b option!"
    # is cut into
    #   Hello/ /there/ /--/ /you/ /goof-ball,/ /use/ /the/ /-b/ /option!
    wordsep_simple_re = re.compile(r'(\s+)')

    # XXX neither locale- nor charset-aware: the letter class is built
    # from string.lowercase (US-ASCII, hence English, only)
    sentence_end_re = re.compile(r'[%s]'              # lowercase letter
                                 r'[\.\!\?]'          # sentence-ending punct.
                                 r'[\"\']?'           # optional end-of-quote
                                 r'\Z'                # end of chunk
                                 % string.lowercase)

    def __init__(self,
                 width=70,
                 initial_indent="",
                 subsequent_indent="",
                 expand_tabs=True,
                 replace_whitespace=True,
                 fix_sentence_endings=False,
                 break_long_words=True,
                 drop_whitespace=True,
                 break_on_hyphens=True):
        """Store the wrapping options; see the class docstring for each."""
        # Line geometry.
        self.width = width
        self.initial_indent = initial_indent
        self.subsequent_indent = subsequent_indent

        # Whitespace pre-processing.
        self.expand_tabs = expand_tabs
        self.replace_whitespace = replace_whitespace
        self.fix_sentence_endings = fix_sentence_endings
        self.drop_whitespace = drop_whitespace

        # Where a break is allowed.
        self.break_long_words = break_long_words
        self.break_on_hyphens = break_on_hyphens

        # Unicode-aware twins of the splitting regexes.  They are built
        # per instance from the current pattern text because monkey-patching
        # TextWrapper.wordsep_re is common and has to keep working.
        self.wordsep_re_uni = re.compile(self.wordsep_re.pattern, re.U)
        self.wordsep_simple_re_uni = re.compile(
            self.wordsep_simple_re.pattern, re.U)

    # -- Private methods -----------------------------------------------
    # (possibly useful for subclasses to override)

    def _munge_whitespace(self, text):
        """_munge_whitespace(text : string) -> string

        Normalise whitespace according to the expand_tabs and
        replace_whitespace options: tabs are expanded first, then every
        remaining whitespace character is mapped to a space.  With both
        options on, " foo\\tbar\\n\\nbaz" turns into " foo    bar  baz".
        """
        if self.expand_tabs:
            text = text.expandtabs()
        if not self.replace_whitespace:
            return text

        # Byte strings and unicode strings need different tables; any
        # other type is handed back without translation.
        if isinstance(text, str):
            return text.translate(self.whitespace_trans)
        if isinstance(text, _unicode):
            return text.translate(self.unicode_whitespace_trans)
        return text

    def _split(self, text):
        """_split(text : string) -> [string]

        Cut 'text' into the indivisible chunks that _wrap_chunks()
        works on (chunks are close to, but not the same as, words).
        The text
          Look, goof-ball -- use the -b option!
        yields
          'Look,', ' ', 'goof-', 'ball', ' ', '--', ' ',
          'use', ' ', 'the', ' ', '-b', ' ', 'option!'
        when break_on_hyphens is true, and
          'Look,', ' ', 'goof-ball', ' ', '--', ' ',
          'use', ' ', 'the', ' ', '-b', ' ', 'option!'
        when it is false.
        """
        is_unicode = isinstance(text, _unicode)
        if self.break_on_hyphens:
            splitter = self.wordsep_re_uni if is_unicode else self.wordsep_re
        elif is_unicode:
            splitter = self.wordsep_simple_re_uni
        else:
            splitter = self.wordsep_simple_re

        # re.split() leaves empty strings between adjacent separators.
        return [piece for piece in splitter.split(text) if piece]

    def _fix_sentence_endings(self, chunks):
        """_fix_sentence_endings(chunks : [string])

        Widen, in place, the gap after a sentence end.  Text such as
        "... foo.\\nBar ..." reaches this method as
        [..., "foo.", " ", "Bar", ...], i.e. with a single space after
        the full stop; that one-space chunk is replaced by two spaces.
        """
        ends_sentence = self.sentence_end_re.search
        # Only items are reassigned, so the list length never changes.
        last = len(chunks) - 1
        pos = 0
        while pos < last:
            following = pos + 1
            if chunks[following] == " " and ends_sentence(chunks[pos]):
                chunks[following] = "  "
                # The widened space cannot itself end a sentence.
                pos = following + 1
            else:
                pos = following

    def _handle_long_word(self, reversed_chunks, cur_line, cur_len, width):
        """_handle_long_word(chunks : [string],
                             cur_line : [string],
                             cur_len : int, width : int)

        Deal with the chunk on top of 'reversed_chunks' (normally a
        word rather than whitespace) that is too long to fit on a line
        of its own.  Both lists are modified in place.
        """
        # An indent at least as wide as the whole line leaves no room at
        # all; still consume one character per pass so wrapping finishes.
        space_left = 1 if width < 1 else width - cur_len

        if self.break_long_words:
            # Fill the rest of the line and leave the remainder of the
            # word on the stack for the following line(s).
            word = reversed_chunks[-1]
            cur_line.append(word[:space_left])
            reversed_chunks[-1] = word[space_left:]
        elif not cur_line:
            # The word must stay whole.  Placing it on an empty line
            # keeps the overshoot beyond 'width' as small as possible.
            cur_line.append(reversed_chunks.pop())

        # Remaining case: the word must stay whole and the line already
        # has text.  Nothing is done here; _wrap_chunks() comes back with
        # an empty line, which is then given entirely to the long word.

    def _wrap_chunks(self, chunks):
        """_wrap_chunks(chunks : [string]) -> [string]

        Pack 'chunks' into lines no longer than 'self.width' and return
        those lines (longer lines are possible when 'break_long_words'
        is false).  A chunk is roughly a word or the whitespace between
        two words; it is never split (except through
        'break_long_words'), whereas a line break may fall between any
        two chunks.  A chunk is expected to be either all whitespace or
        free of whitespace.  Whitespace chunks at the start and end of a
        line are dropped; all other whitespace is kept.  'chunks' is
        consumed (reversed, then emptied) in the process.

        Raises ValueError when 'self.width' is not positive.
        """
        if self.width <= 0:
            raise ValueError("invalid width %r (must be > 0)" % self.width)

        wrapped = []

        # Reverse once so the next chunk can be popped cheaply from the
        # end of the list, stack-fashion.
        chunks.reverse()

        while chunks:
            # Choose the prefix for this line and work out how much
            # room is left next to it.
            if wrapped:
                prefix = self.subsequent_indent
            else:
                prefix = self.initial_indent
            room = self.width - len(prefix)

            # A line must not start with whitespace -- except the very
            # first one, where leading whitespace belongs to the text.
            if self.drop_whitespace and chunks[-1].strip() == '' and wrapped:
                chunks.pop()

            # Take chunks for as long as they fit.  'used' is the total
            # length of the chunks collected in 'pieces'.
            pieces = []
            used = 0
            while chunks:
                size = len(chunks[-1])
                if used + size > room:
                    break
                pieces.append(chunks.pop())
                used += size

            # The chunk that did not fit is too big for *any* line, not
            # just for what was left of this one.
            if chunks and len(chunks[-1]) > room:
                self._handle_long_word(chunks, pieces, used, room)

            # A line must not end with whitespace either.
            if self.drop_whitespace and pieces and pieces[-1].strip() == '':
                pieces.pop()

            # Emit the line unless nothing at all ended up on it.
            if pieces:
                wrapped.append(prefix + ''.join(pieces))

        return wrapped

    # -- Public interface ----------------------------------------------

    def wrap(self, text):
        """wrap(text : string) -> [string]

        Wrap the single paragraph in 'text' and return the resulting
        lines, none of them wider than 'self.width' columns.  Subject
        to the expand_tabs / replace_whitespace options, tabs are
        expanded with expandtabs() and every other whitespace character
        (newlines included) becomes a space.
        """
        chunks = self._split(self._munge_whitespace(text))
        if self.fix_sentence_endings:
            self._fix_sentence_endings(chunks)
        return self._wrap_chunks(chunks)

    def fill(self, text):
        """fill(text : string) -> string

        Wrap the single paragraph in 'text' like wrap() does, but
        return the lines joined by newlines as one string.
        """
        wrapped_lines = self.wrap(text)
        return "\n".join(wrapped_lines)
 | 
| 340 | 
 | 
| 341 | 
 | 
| 342 | # -- Convenience interface ---------------------------------------------
 | 
| 343 | 
 | 
def wrap(text, width=70, **kwargs):
    """Wrap a single paragraph of text, returning a list of wrapped lines.

    The paragraph in 'text' is re-flowed so that no line is wider than
    'width' columns, and the lines are returned as a list.  Unless the
    options say otherwise, tabs are expanded and all other whitespace
    characters (newlines included) are turned into spaces.  Any extra
    keyword argument is passed to the TextWrapper constructor; see that
    class for the available options.
    """
    return TextWrapper(width=width, **kwargs).wrap(text)
 | 
| 356 | 
 | 
def fill(text, width=70, **kwargs):
    """Fill a single paragraph of text, returning a new string.

    The paragraph in 'text' is re-flowed so that no line is wider than
    'width' columns, and the result is returned as one newline-joined
    string.  Whitespace is treated the same way as in wrap().  Any extra
    keyword argument is passed to the TextWrapper constructor; see that
    class for the available options.
    """
    return TextWrapper(width=width, **kwargs).fill(text)
 | 
| 368 | 
 | 
| 369 | 
 | 
| 370 | # -- Loosely related functionality -------------------------------------
 | 
| 371 | 
 | 
# A line consisting solely of spaces and/or tabs.
_whitespace_only_re = re.compile('^[ \t]+$', re.MULTILINE)
# The (possibly empty) run of spaces/tabs in front of the first other
# character of every non-blank line.
_leading_whitespace_re = re.compile('(^[ \t]*)(?:[^ \t\n])', re.MULTILINE)


def dedent(text):
    """Remove any common leading whitespace from every line in `text`.

    This can be used to make triple-quoted strings line up with the left
    edge of the display, while still presenting them in the source code
    in indented form.

    Note that tabs and spaces are both treated as whitespace, but they
    are not equal: the lines "  hello" and "\\thello" are
    considered to have no common leading whitespace.  (This behaviour is
    new in Python 2.5; older versions of this module incorrectly
    expanded tabs before searching for common leading whitespace.)

    Lines that contain only spaces and tabs are reduced to empty lines
    and do not take part in the computation of the common margin.

    Returns the dedented text as a new string.
    """
    # Empty out whitespace-only lines first so that they neither
    # constrain the margin nor keep stray indentation afterwards.
    text = _whitespace_only_re.sub('', text)

    # 'margin' ends up as the longest run of spaces and tabs that every
    # non-blank line starts with (None while no such line was seen).
    margin = None
    for indent in _leading_whitespace_re.findall(text):
        if margin is None:
            margin = indent

        # Line indented at least as deeply as the current margin, and
        # consistently with it: the margin stands.
        elif indent.startswith(margin):
            continue

        # Line indented less deeply, but consistently: its indent is
        # the new margin.
        elif margin.startswith(indent):
            margin = indent

        # Inconsistent indentation: cut the margin back to the part both
        # share.  Neither string is a prefix of the other here, so the
        # scan is guaranteed to find a differing position.
        else:
            for i, (x, y) in enumerate(zip(margin, indent)):
                if x != y:
                    margin = margin[:i]
                    break

    # 'margin' holds nothing but spaces and tabs, so it can be used in
    # the pattern without escaping.
    if margin:
        text = re.sub(r'(?m)^' + margin, '', text)
    return text
 | 
| 426 | 
 | 
if __name__ == "__main__":
    # Minimal manual smoke test.  Other inputs worth trying by hand:
    #   dedent("\tfoo\n\tbar")
    #   dedent("  \thello there\n  \t  how are you?")
    sample = "Hello there.\n  This is indented."
    print(dedent(sample))
 |