Unicode encoding and decoding
Check or change the encoding of a file
To change the encoding of a given file:
$ iconv -f ISO-8859-1 -t UTF-8 source.txt > target.txt
To list all supported encodings:
$ iconv -l
How to determine file encoding:
$ file -i {filename}
or (on osx):
$ file -I {filename}
Also check man page for enca:
$ enca --help
If everything fails, install the following Python character encoding detector:
$ pip install chardet
$ chardetect input_filename
Python snippet to fix encoding
def fix_encoding(text):
    """Return ``text`` as a UTF-8 encoded byte string.

    Byte strings are decoded with the first candidate encoding that accepts
    them and then re-encoded as UTF-8. Text (unicode) strings are encoded
    to UTF-8 directly. Any other object is returned unchanged.

    :param text: byte string or text string of unknown encoding
    :returns: UTF-8 encoded bytes, or ``text`` itself when it is not a string
    """
    if isinstance(text, type(u'')):
        # Already decoded text: only the final encoding step is needed.
        return text.encode('utf-8')
    if not isinstance(text, (bytes, bytearray)):
        return text

    # Order matters: ISO-8859-15 assigns a character to every byte value, so
    # it never fails and must be tried last. UTF-8 is strict and also covers
    # plain ASCII, so valid UTF-8 input is passed through instead of being
    # mis-read as Latin-9 and double-encoded.
    for encoding in ('utf-8', 'iso-8859-15'):
        try:
            decoded = text.decode(encoding)
        except UnicodeDecodeError:
            continue
        return decoded.encode('utf-8')

    # Defensive fallback: no candidate encoding accepted the input.
    return text
Pretty print considering encoding
Instead of:
from pprint import pprint

# Sample nested structure printed with the stock pretty-printer.
a = [
    0,
    1,
    ['a', 'b', 'c'],
    2,
    3,
    4,
]
pprint(a)
do this ...
# coding=utf8
import pprint


class MyPrettyPrinter(pprint.PrettyPrinter):
    """Pretty-printer that shows text strings as readable characters.

    On Python 2 the stock printer renders ``u'işüğçö'`` as escape sequences;
    this subclass emits the UTF-8 encoded characters instead. On Python 3
    the stock printer already prints text readably, so nothing is overridden.
    """

    def format(self, object, context, maxlevels, level):
        """Return ``(representation, readable, recursive)`` for ``object``.

        The parameter names mirror ``pprint.PrettyPrinter.format`` (which is
        why ``object`` shadows the builtin here).
        """
        # ``str is bytes`` is only true on Python 2, the only place where the
        # text type needs encoding before it can be written out verbatim.
        if str is bytes and isinstance(object, type(u'')):
            return (object.encode('utf8'), True, False)
        return pprint.PrettyPrinter.format(
            self, object, context, maxlevels, level)


d = {'foo': u'işüğçö'}
pprint.pprint(d)             # Python 2: {'foo': u'i\u015f\xfc\u011f\xe7\xf6'}
MyPrettyPrinter().pprint(d)  # Python 2: {'foo': işüğçö}
How to get StringIO.write to accept unicode string
import codecs
from StringIO import StringIO

# Variant that seeds the buffer with the content directly:
#buffer = StringIO(importazione.content)
buffer = StringIO()

# Wrap the byte buffer so that unicode written to it is encoded as UTF-8
# (and bytes read from it are decoded as UTF-8).
wrapper = codecs.StreamReaderWriter(
    buffer,
    codecs.getreader("utf8"),
    codecs.getwriter("utf8"),
)
wrapper.write(importazione.content)

# Rewind so the buffer can be read from the beginning.
buffer.seek(0)
import codecs
from StringIO import StringIO

# Variant that seeds the buffer with the content directly:
#buffer = StringIO(importazione.content)
buffer = StringIO()

# Write-only wrapper: unicode passed to write() is encoded as UTF-8
# before it reaches the underlying byte buffer.
utf8_writer = codecs.getwriter("utf8")
wrapper = utf8_writer(buffer)
wrapper.write(importazione.content)

# Rewind so the buffer can be read from the beginning.
buffer.seek(0)
How to write UTF-8 in a CSV file
pip2 install unicodecsv
import unicodecsv as csv

# The writer emits already-encoded bytes, so the file is opened in binary
# mode ('wb'); text mode can corrupt line endings on some platforms.
# The ``with`` block guarantees the file is flushed and closed.
with open(filename, 'wb') as out:
    writer = csv.writer(out, dialect='excel', encoding='utf-8')
    # Call writer.writerow(...) / writer.writerows(...) here, while the
    # file is still open.
References: https://stackoverflow.com/questions/18766955/how-to-write-utf-8-in-a-csv-file#31642070
UnicodeDictReader: a csv.DictReader class which understands unicode
import csv


class UnicodeDictReader(object):
    """
    A CSV reader which will iterate over lines in the CSV file "f",
    which is encoded in the given encoding.

    Each row is returned as a dict whose byte-string values have been
    decoded to text with ``encoding``. Keys are left as produced by
    ``csv.DictReader``. Values that are already text (as on Python 3,
    where "f" must be a text-mode file) are passed through unchanged.

    Usage:

        reader = UnicodeDictReader(input_file)
        for row in reader:
            ...
    """

    def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
        self.encoding = encoding
        self.reader = csv.DictReader(f, dialect=dialect, **kwds)

    def _decode(self, value):
        """Decode one cell value, leaving non-byte values untouched."""
        if isinstance(value, bytes):
            return value.decode(self.encoding)
        if isinstance(value, list):
            # Surplus fields are collected by DictReader into a list
            # (stored under ``restkey``).
            return [self._decode(item) for item in value]
        # None (``restval`` for short rows) or already-decoded text.
        return value

    def next(self):
        """Return the next row as a dict with decoded values."""
        row = next(self.reader)
        return {key: self._decode(value) for key, value in row.items()}

    # Python 3 iterator protocol.
    __next__ = next

    def __iter__(self):
        return self
UnicodeWriter: a csv.writer class which understands unicode
import codecs
import csv
import io

# True on Python 2, where ``str`` is the byte-string type and the csv module
# reads and writes bytes rather than text.
_PY2 = str is bytes
_TEXT_TYPE = type(u"")


class UnicodeWriter(object):
    """
    A CSV writer which will write rows to CSV file "f",
    which is encoded in the given encoding.

    "f" must accept byte strings (e.g. a file opened in binary mode).
    Byte-string cells are assumed to be UTF-8; every other cell is
    converted to text before being written.

    Sample usage:

        writer = UnicodeWriter(outbuffer, dialect='excel',
                               encoding="windows-1252",
                               quoting=csv.QUOTE_NONNUMERIC, delimiter=';')
    """

    def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
        # Rows are first formatted into an in-memory queue; the queue holds
        # bytes on Python 2 and text on Python 3, matching what csv writes.
        self.queue = io.BytesIO() if _PY2 else io.StringIO()
        self.writer = csv.writer(self.queue, dialect=dialect, **kwds)
        self.stream = f
        self.encoder = codecs.getincrementalencoder(encoding)()

    def writerow(self, row):
        """Format one row and write it to the stream in the target encoding."""
        cells = [
            value.decode("utf8") if isinstance(value, bytes)
            else _TEXT_TYPE(value)
            for value in row
        ]
        if _PY2:
            # The Python 2 csv module only handles byte strings.
            cells = [cell.encode("utf-8") for cell in cells]
        self.writer.writerow(cells)

        # Fetch the formatted line from the queue ...
        data = self.queue.getvalue()
        if _PY2:
            data = data.decode("utf-8")
        # ... re-encode it into the target encoding and write it out.
        self.stream.write(self.encoder.encode(data))

        # Empty the queue; io objects keep their position on truncate, so
        # rewind explicitly first.
        self.queue.seek(0)
        self.queue.truncate(0)

    def writerows(self, rows):
        """Write every row of ``rows`` with :meth:`writerow`."""
        for row in rows:
            self.writerow(row)
Decoding example
'\xe4' is a Latin-1 encoded byte string representing the Unicode character ä.
To explicitly decode the pyodbc result in Python 2.7
>>> res = '\xe4'
>>> res.decode('latin1'), type(res.decode('latin1'))
(u'\xe4', <type 'unicode'>)
>>> print res.decode('latin1')
ä
Python 3.x does this for you (the str type includes unicode characters):
>>> res = '\xe4'
>>> res, type(res)
('ä', <class 'str'>)
>>> print(res)
ä