(original) (raw)
changeset: 84587:1f3c57ac07ed branch: 3.3 parent: 84558:4acb822f4c43 user: R David Murray rdmurray@bitdance.com date: Fri Jul 12 16:00:28 2013 -0400 files: Lib/email/_header_value_parser.py Lib/test/test_email/test__header_value_parser.py Lib/test/test_email/test_headerregistry.py Misc/NEWS description: #18431: Decode encoded words in atoms in new email parser. There is more to be done here in terms of accepting RFC invalid input that some mailers accept, but this covers the valid RFC places where encoded words can occur in structured headers. diff -r 4acb822f4c43 -r 1f3c57ac07ed Lib/email/_header_value_parser.py --- a/Lib/email/_header_value_parser.py Thu Jul 11 15:52:57 2013 -0400 +++ b/Lib/email/_header_value_parser.py Fri Jul 12 16:00:28 2013 -0400 @@ -1627,6 +1627,7 @@ def get_atom(value): """atom = [CFWS] 1*atext [CFWS] + An atom could be an rfc2047 encoded word. """ atom = Atom() if value and value[0] in CFWS_LEADER: @@ -1635,7 +1636,15 @@ if value and value[0] in ATOM_ENDS: raise errors.HeaderParseError( "expected atom but found '{}'".format(value)) - token, value = get_atext(value) + if value.startswith('=?'): + try: + token, value = get_encoded_word(value) + except errors.HeaderParseError: + # XXX: need to figure out how to register defects when + # appropriate here. + token, value = get_atext(value) + else: + token, value = get_atext(value) atom.append(token) if value and value[0] in CFWS_LEADER: token, value = get_cfws(value) @@ -1664,12 +1673,22 @@ def get_dot_atom(value): """ dot-atom = [CFWS] dot-atom-text [CFWS] + Any place we can have a dot atom, we could instead have an rfc2047 encoded + word. """ dot_atom = DotAtom() if value[0] in CFWS_LEADER: token, value = get_cfws(value) dot_atom.append(token) - token, value = get_dot_atom_text(value) + if value.startswith('=?'): + try: + token, value = get_encoded_word(value) + except errors.HeaderParseError: + # XXX: need to figure out how to register defects when + # appropriate here. 
+ token, value = get_dot_atom_text(value) + else: + token, value = get_dot_atom_text(value) dot_atom.append(token) if value and value[0] in CFWS_LEADER: token, value = get_cfws(value) diff -r 4acb822f4c43 -r 1f3c57ac07ed Lib/test/test_email/test__header_value_parser.py --- a/Lib/test/test_email/test__header_value_parser.py Thu Jul 11 15:52:57 2013 -0400 +++ b/Lib/test/test_email/test__header_value_parser.py Fri Jul 12 16:00:28 2013 -0400 @@ -808,9 +808,13 @@ self.assertEqual(atom[2].comments, ['bar']) def test_get_atom_atom_ends_at_noncfws(self): - atom = self._test_get_x(parser.get_atom, + self._test_get_x(parser.get_atom, 'bob fred', 'bob ', 'bob ', [], 'fred') + def test_get_atom_rfc2047_atom(self): + self._test_get_x(parser.get_atom, + '=?utf-8?q?=20bob?=', ' bob', ' bob', [], '') + # get_dot_atom_text def test_get_dot_atom_text(self): @@ -885,6 +889,10 @@ with self.assertRaises(errors.HeaderParseError): parser.get_dot_atom(' (foo) bar.bang. foo') + def test_get_dot_atom_rfc2047_atom(self): + self._test_get_x(parser.get_dot_atom, + '=?utf-8?q?=20bob?=', ' bob', ' bob', [], '') + # get_word (if this were black box we'd repeat all the qs/atom tests) def test_get_word_atom_yields_atom(self): @@ -2156,6 +2164,22 @@ self.assertEqual(address[0].token_type, 'mailbox') + def test_get_address_rfc2047_display_name(self): + address = self._test_get_x(parser.get_address, + '=?utf-8?q?=C3=89ric?= foo@example.com', + 'Éric foo@example.com', + 'Éric foo@example.com', + [], + '') + self.assertEqual(address.token_type, 'address') + self.assertEqual(len(address.mailboxes), 1) + self.assertEqual(address.mailboxes, + address.all_mailboxes) + self.assertEqual(address.mailboxes[0].display_name, + 'Éric') + self.assertEqual(address[0].token_type, + 'mailbox') + def test_get_address_empty_group(self): address = self._test_get_x(parser.get_address, 'Monty Python:;', diff -r 4acb822f4c43 -r 1f3c57ac07ed Lib/test/test_email/test_headerregistry.py --- 
a/Lib/test/test_email/test_headerregistry.py Thu Jul 11 15:52:57 2013 -0400 +++ b/Lib/test/test_email/test_headerregistry.py Fri Jul 12 16:00:28 2013 -0400 @@ -158,6 +158,10 @@ '=?utf-8?q?=C3=89ric?=', 'Éric'), + 'rfc2047_quopri_with_regular_text': ( + 'The =?utf-8?q?=C3=89ric=2C?= Himself', + 'The Éric, Himself'), + } @@ -1119,6 +1123,26 @@ 'example.com', None), + 'rfc2047_atom_is_decoded': + ('=?utf-8?q?=C3=89ric?= foo@example.com', + [], + 'Éric foo@example.com', + 'Éric', + 'foo@example.com', + 'foo', + 'example.com', + None), + + 'rfc2047_atom_in_phrase_is_decoded': + ('The =?utf-8?q?=C3=89ric=2C?= Himself foo@example.com', + [], + '"The Éric, Himself" foo@example.com', + 'The Éric, Himself', + 'foo@example.com', + 'foo', + 'example.com', + None), + } # XXX: Need many more examples, and in particular some with names in diff -r 4acb822f4c43 -r 1f3c57ac07ed Misc/NEWS --- a/Misc/NEWS Thu Jul 11 15:52:57 2013 -0400 +++ b/Misc/NEWS Fri Jul 12 16:00:28 2013 -0400 @@ -47,6 +47,9 @@ Library ------- +- Issue #18431: The new email header parser now decodes RFC2047 encoded words + in structured headers. + - Issue #18044: The new email header parser was mis-parsing encoded words where an encoded character immediately followed the '?' that follows the CTE character, resulting in a decoding failure. They are now decoded correctly.