Changed decode_string to always return unicode.

This commit is contained in:
bendikro 2012-11-25 13:01:12 +01:00
parent ffb902ba06
commit 60f196ff93
2 changed files with 29 additions and 14 deletions

View file

@ -608,22 +608,34 @@ def xml_encode(string):
def decode_string(s, encoding="utf8"):
    """
    Decodes a byte string and returns unicode.

    The candidates are tried in this order, each with strict error handling
    unless noted otherwise:

    1. `:param:encoding`
    2. utf8
    3. iso-8859-1
    4. the encoding detected by chardet
    5. the encoding detected by chardet, ignoring undecodable bytes

    If every candidate fails, an empty unicode string is returned.

    :param s: string to decode
    :type s: string
    :keyword encoding: the encoding to use in the decoding
    :type encoding: string
    :returns: s converted to unicode
    :rtype: unicode

    """
    if not s:
        return u''
    # type(u'') is the text type (``unicode`` on Python 2); text needs no decoding.
    if isinstance(s, type(u'')):
        return s

    def _candidates():
        # Yield (codec, error handler) pairs lazily so that the comparatively
        # expensive chardet detection only runs when the cheap codecs failed.
        if encoding:
            yield encoding, 'strict'
        yield "utf8", 'strict'
        # NOTE: iso-8859-1 maps every byte value, so a strict decode of a byte
        # string always succeeds here; the detection fallbacks below are kept
        # as a safety net and to preserve the documented order.
        yield "iso-8859-1", 'strict'
        detected = chardet.detect(s)["encoding"]
        # chardet reports None when it cannot identify the encoding.
        if detected:
            yield detected, 'strict'
            yield detected, 'ignore'

    for codec, errors in _candidates():
        try:
            return s.decode(codec, errors)
        except (UnicodeDecodeError, LookupError):
            # LookupError: the codec name is unknown to Python; try the next one.
            continue
    return u''
def utf8_encoded(s): def utf8_encoded(s):
""" """
@ -636,7 +648,10 @@ def utf8_encoded(s):
""" """
if isinstance(s, str): if isinstance(s, str):
s = decode_string(s) try:
s = decode_string(s).encode("utf8")
except UnicodeEncodeError:
log.warn("Error when encoding to utf8: %s" % s)
elif isinstance(s, unicode): elif isinstance(s, unicode):
s = s.encode("utf8", "ignore") s = s.encode("utf8", "ignore")
return s return s

View file

@ -51,7 +51,7 @@ except ImportError:
from sha import sha from sha import sha
from deluge import bencode from deluge import bencode
from deluge.common import decode_string, path_join from deluge.common import utf8_encoded, path_join
import deluge.configmanager import deluge.configmanager
log = logging.getLogger(__name__) log = logging.getLogger(__name__)
@ -88,9 +88,9 @@ class TorrentInfo(object):
# Check if 'name.utf-8' is in the torrent and if not try to decode the string # Check if 'name.utf-8' is in the torrent and if not try to decode the string
# using the encoding found. # using the encoding found.
if "name.utf-8" in self.__m_metadata["info"]: if "name.utf-8" in self.__m_metadata["info"]:
self.__m_name = decode_string(self.__m_metadata["info"]["name.utf-8"]) self.__m_name = utf8_encoded(self.__m_metadata["info"]["name.utf-8"])
else: else:
self.__m_name = decode_string(self.__m_metadata["info"]["name"], self.encoding) self.__m_name = utf8_encoded(self.__m_metadata["info"]["name"], self.encoding)
# Get list of files from torrent info # Get list of files from torrent info
paths = {} paths = {}
@ -104,7 +104,7 @@ class TorrentInfo(object):
if "path.utf-8" in f: if "path.utf-8" in f:
path = os.path.join(prefix, *f["path.utf-8"]) path = os.path.join(prefix, *f["path.utf-8"])
else: else:
path = decode_string(os.path.join(prefix, decode_string(os.path.join(*f["path"]), self.encoding)), self.encoding) path = utf8_encoded(os.path.join(prefix, utf8_encoded(os.path.join(*f["path"]), self.encoding)), self.encoding)
f["index"] = index f["index"] = index
paths[path] = f paths[path] = f
@ -160,7 +160,7 @@ class TorrentInfo(object):
if "path.utf-8" in f: if "path.utf-8" in f:
path = os.path.join(prefix, *f["path.utf-8"]) path = os.path.join(prefix, *f["path.utf-8"])
else: else:
path = decode_string(os.path.join(prefix, decode_string(os.path.join(*f["path"]), self.encoding)), self.encoding) path = utf8_encoded(os.path.join(prefix, utf8_encoded(os.path.join(*f["path"]), self.encoding)), self.encoding)
self.__m_files.append({ self.__m_files.append({
'path': path, 'path': path,
'size': f["length"], 'size': f["length"],