Privacy Policy
Snippets index

  Unicode encoding and decoding

Check or change the encoding of a file

To change the encoding of a given file:

$ iconv -f ISO-8859-1 -t UTF-8 source.txt > target.txt

To list all supported encodings:

$ iconv -l

How to determine file encoding:

$ file -i {filename}

or (on osx):

$ file -I {filename}

Also check man page for enca:

$ enca --help

If everything fails, install the following Python character encoding detector:

$ pip install chardet
$ chardetect input_filename

Python snippet to fix encoding

def fix_encoding(text):
    """Return *text* re-encoded as UTF-8 bytes.

    The input is expected to be a byte string in an unknown encoding.
    Strict UTF-8 is tried first, so data that is already valid UTF-8
    (plain ASCII included) comes back unchanged.  ISO-8859-15 is the
    fallback; it maps every byte value, so it always succeeds for bytes.

    Anything that cannot be decoded at all (for instance an object without
    a ``decode`` method) is returned untouched.
    """
    # Order matters: ISO-8859-15 accepts any byte sequence, so it has to come
    # after the stricter UTF-8 attempt, otherwise valid UTF-8 input would be
    # mis-decoded and then double-encoded.
    encodings = ('utf-8', 'iso-8859-15')
    for encoding in encodings:
        try:
            utext = text.decode(encoding)
        except (UnicodeError, AttributeError):
            # Wrong guess (or not decodable at all): try the next candidate.
            continue
        return utext.encode('utf-8')
    return text

Pretty print considering encoding

Instead of:

from pprint import pprint

# Baseline: hand a small nested list to the stock pprint() function.
a = [0, 1, ['a', 'b', 'c'], 2, 3, 4]
pprint(a)

do this ...

# coding=utf8

import pprint

class MyPrettyPrinter(pprint.PrettyPrinter):
    """PrettyPrinter that renders unicode strings as UTF-8 text, not escapes."""

    def format(self, object, context, maxlevels, level):
        """Format one object for the printer.

        Returns the ``(text, readable, recursive)`` triple that
        ``PrettyPrinter.format`` is expected to produce.  Unicode strings are
        emitted as their UTF-8 encoding; every other object is delegated to
        the base class.
        """
        if not isinstance(object, unicode):
            # Not a unicode string: keep the stock formatting.
            return pprint.PrettyPrinter.format(
                self, object, context, maxlevels, level)
        encoded = object.encode('utf8')
        return (encoded, True, False)


# Sample data: a dict whose value is a unicode string with non-ASCII letters.
d = {'foo': u'işüğçö'}

# Same data through both printers; the trailing comments show the output
# each call is expected to produce.
pprint.pprint(d)              # {'foo': u'i\u015f\xfc\u011f\xe7\xf6'}
MyPrettyPrinter().pprint(d)   # {'foo': işüğçö}

Credits:

How to get StringIO.write to accept unicode string

from StringIO import StringIO
import codecs

# Direct approach that this snippet replaces:
#buffer = StringIO(importazione.content)
# Start from an empty in-memory buffer instead ...
buffer = StringIO()
# ... look up the UTF-8 codec and wrap the buffer with its stream reader/writer ...
codecinfo = codecs.lookup("utf8")
wrapper = codecs.StreamReaderWriter(buffer, codecinfo.streamreader, codecinfo.streamwriter)
# ... so that the content is encoded as UTF-8 on its way into the buffer.
# NOTE(review): importazione.content is presumably a unicode string - confirm.
wrapper.write(importazione.content)
# Rewind so that the next read starts at the beginning of the data.
buffer.seek(0)

or:

from StringIO import StringIO
import codecs

# Direct approach that this snippet replaces:
#buffer = StringIO(importazione.content)
buffer = StringIO()
# codecs.getwriter() returns the UTF-8 stream-writer class; calling it on the
# buffer gives a wrapper that encodes whatever is written through it.
wrapper = codecs.getwriter("utf8")(buffer)
# NOTE(review): importazione.content is presumably a unicode string - confirm.
wrapper.write(importazione.content)
# Rewind so that the next read starts at the beginning of the data.
buffer.seek(0)

Credits:

How to write UTF-8 in a CSV file

pip2 install unicodecsv

then:

import unicodecsv as csv

# The writer emits already-encoded bytes, so the file has to be opened in
# binary mode ('wb'); text mode can corrupt line endings on Windows.
# Remember to call out.close() once all rows have been written.
out = open(filename, 'wb')
writer = csv.writer(out, dialect='excel', encoding='utf-8')

References: https://stackoverflow.com/questions/18766955/how-to-write-utf-8-in-a-csv-file#31642070

UnicodeDictReader: a csv.DictReader class which understands unicode

import csv

class UnicodeDictReader:
    """
    A CSV reader which will iterate over lines in the CSV file "f",
    which is encoded in the given encoding, yielding one dict per row.

    Field values are decoded to unicode using ``encoding``; the dict keys
    (the field names) are left exactly as ``csv.DictReader`` produced them.

    Usage: reader = UnicodeDictReader(input_file)
    """

    def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
        """Wrap ``f`` in a ``csv.DictReader``; extra keywords are passed on."""
        self.encoding = encoding
        self.reader = csv.DictReader(f, dialect=dialect, **kwds)

    def _decode(self, value):
        """Decode a single field value using the reader's encoding."""
        if isinstance(value, bytes):
            return value.decode(self.encoding)
        if isinstance(value, list):
            # Surplus fields of an over-long row arrive as a list (restkey).
            return [self._decode(item) for item in value]
        # None (restval of a short row) and already-decoded text pass through.
        return value

    def next(self):
        """Return the next row as a dict with decoded values.

        Raises StopIteration when the underlying reader is exhausted.
        """
        row = next(self.reader)
        return {k: self._decode(v) for k, v in row.items()}

    # Iterator protocol name used by Python 3.
    __next__ = next

    def __iter__(self):
        return self

Credits:

UnicodeWriter: a csv.writer class which understands unicode

import csv
import cStringIO
import codecs


class UnicodeWriter:
    """
    A CSV writer which will write rows to CSV file "f",
    which is encoded in the given encoding.

    Each row is first formatted as UTF-8 CSV into an in-memory queue, then
    re-encoded into the target encoding and written to "f".

    Sample usage:

        writer = UnicodeWriter(outbuffer, dialect='excel', encoding="windows-1252",
            quoting=csv.QUOTE_NONNUMERIC, delimiter=';')
    """

    def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
        """Set up the writer.

        f        -- target stream; only its write() method is used here
        dialect  -- csv dialect handed to csv.writer
        encoding -- encoding of the data written to f
        kwds     -- further keyword arguments handed to csv.writer
        """
        # Redirect output to a queue
        self.queue = cStringIO.StringIO()
        self.writer = csv.writer(self.queue, dialect=dialect, **kwds)
        self.stream = f
        # Incremental encoder that turns unicode text into the target encoding.
        self.encoder = codecs.getincrementalencoder(encoding)()

    def writerow(self, row):
        """Write one row to the target stream in the target encoding."""
        # Byte strings are taken to be UTF-8 and decoded; other values are kept.
        row = [v.decode('utf8') if isinstance(v, str) else v for v in row]
        # Every value is converted with unicode() and handed to csv.writer as
        # UTF-8 bytes.
        # NOTE(review): numbers and None therefore reach csv.writer as strings;
        # with quoting=csv.QUOTE_NONNUMERIC they are presumably quoted as
        # well - confirm that this is intended.
        self.writer.writerow([unicode(s).encode("utf-8") for s in row])
        # Fetch UTF-8 output from the queue ...
        data = self.queue.getvalue()
        data = data.decode("utf-8")
        # ... and reencode it into the target encoding
        data = self.encoder.encode(data)
        # write to the target stream
        self.stream.write(data)
        # empty queue
        self.queue.truncate(0)

    def writerows(self, rows):
        """Write every row in rows, one writerow() call per row."""
        for row in rows:
            self.writerow(row)

Credits:

Decoding example

'\xe4' is a Latin-1 encoded byte string representing the unicode ä character.

To explicitly decode the pyodbc result in Python 2.7

>>> res = '\xe4'
>>> res.decode('latin1'), type(res.decode('latin1'))
(u'\xe4', <type 'unicode'>)
>>> print res.decode('latin1')
ä

Python 3.x does this for you (the str type includes unicode characters):

>>> res = '\xe4'
>>> res, type(res)
('ä', <class 'str'>)
>>> print(res)
ä