Python – Image Similarity Comparison Using Several Techniques

image_similarity.py

# -*- coding: utf-8 -*-
"""

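Installing the required libraries:

sudo apt-get install -y python-pip
sudo pip install PIL numpy

(On current systems the maintained PIL fork is installed with
``pip install Pillow``; it provides the same ``PIL`` package.)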

"""
import os, time, re, urllib
from PIL import Image
import logging
# Verbose layout (adds the level and the function name); not used by default.
LOG_FORMAT_VERBOSE = '%(asctime)s - %(levelname)s - %(filename)s:%(lineno)s - %(funcName)s() - %(message)s'
# Compact layout that is actually passed to basicConfig below.
LOG_FORMAT = '%(asctime)s - %(filename)s:%(lineno)s - %(message)s'
# Named constants are used so the builtin format() is not shadowed at module level.
logging.basicConfig(level=logging.DEBUG, format=LOG_FORMAT)
logger = logging.getLogger(__name__)


def main():
    """Run the comparison on the built-in sample URL groups.

    The sample text lists image URLs in groups separated by lines of '#';
    the downloaded copies are stored under /tmp/imagesimilarity/.
    """
    sample_url_groups = """
    http://www.linuxfestnorthwest.org/sites/default/files/sponsors/elephant.png
    http://www.linuxfestnorthwest.org/sites/default/files/sponsors/elephant.png
    #######
    http://www.linuxfestnorthwest.org/sites/default/files/sponsors/elephant.png
    http://terminaltwister.com/wp-content/uploads/2013/09/220px-Postgresql_elephant.svg_.png
    #######
    """
    begin_similarty_compare(
        url_texts=sample_url_groups,
        download_base_directory='/tmp/imagesimilarity/',
    )

def begin_similarty_compare(url_texts, download_base_directory):
    """Download each group of image URLs and log every similarity measure for it.

    url_texts: text containing image URLs separated by whitespace; runs of
        '#' separate one group of URLs from the next. The first two URLs of
        a group are the ones that get compared.
    download_base_directory: directory the images are saved into (created
        when missing).

    Every URL is stored under an enumerated file name and the download is
    skipped when that file already exists. Groups with fewer than two URLs
    are skipped with a warning, since there is nothing to compare.
    """
    # urlretrieve lives in urllib.request on Python 3 and in urllib on Python 2.
    try:
        from urllib.request import urlretrieve
    except ImportError:
        from urllib import urlretrieve

    logger.debug("image file location: %s"%(download_base_directory ))

    # (log label, comparison function); the trailing blank in the first label
    # keeps that log line identical to the historical output.
    comparisons = (
        ("image_similarity_bands_via_numpy ", image_similarity_bands_via_numpy),
        ("image_similarity_histogram_via_pil", image_similarity_histogram_via_pil),
        ("image_similarity_vectors_via_numpy", image_similarity_vectors_via_numpy),
        ("image_similarity_greyscale_hash_code", image_similarity_greyscale_hash_code),
    )

    idx = 0  # counter for downloaded image names
    for url_text in re.split('#+', url_texts):
        pair = url_text.split()
        if not pair:
            continue
        if len(pair) < 2:
            logger.warning("skipping group with a single url: %s"%(pair[0],))
            continue

        filepath_url = []
        for url in pair:
            idx += 1
            # the running number keeps file names unique even for repeated URLs
            filename = "%02d-%s"%(idx, url.split('/')[-1])
            filepath = os.path.join(download_base_directory, filename)
            mkdir_p_filepath(filepath)
            if not os.path.exists(filepath):
                logger.debug("downloading image %s to %s ..."%(url, download_base_directory))
                urlretrieve(url, filepath)
                logger.debug("downloading done")
            filepath_url.append((filepath, url))

        logger.debug("*"*20)
        logger.debug("compare images start")

        (image_filepath1, url1), (image_filepath2, url2) = filepath_url[:2]
        logger.debug("image1: %s (%s)"%(get_filename(image_filepath1),url1))
        logger.debug("image2: %s (%s)"%(get_filename(image_filepath2),url2))

        for label, compare in comparisons:
            t1 = time.time()
            similarity = compare(image_filepath1, image_filepath2)
            duration = "%0.1f"%((time.time() - t1)*1000)
            logger.debug("%s => %s took %s ms"%(label, similarity, duration))

        logger.debug("compare images finished")


def image_similarity_bands_via_numpy(filepath1, filepath2):
    """Return the summed absolute per-band pixel difference of two images.

    Both files are opened and reduced with get_thumbnail() first. Returns -1
    when the thumbnails differ in size or in their bands, because the pixel
    data cannot be subtracted element by element then. 0 means the
    thumbnails are pixel-identical; larger values mean less similar.
    """
    import numpy

    image1 = get_thumbnail(Image.open(filepath1))
    image2 = get_thumbnail(Image.open(filepath2))

    # this eliminates unequal images - though it is not very smart
    if image1.size != image2.size or image1.getbands() != image2.getbands():
        return -1

    # One array per image covers all bands at once: rows are pixels, and for
    # multi-band images the columns are the bands. Single-band images give
    # plain ints per pixel, which indexing by band could not handle.
    # int64 keeps the subtraction from wrapping around.
    pixels1 = numpy.array(list(image1.getdata()), dtype=numpy.int64)
    pixels2 = numpy.array(list(image2.getdata()), dtype=numpy.int64)
    return numpy.sum(numpy.abs(pixels1 - pixels2))

def image_similarity_histogram_via_pil(filepath1, filepath2):
    """Return the root-mean-square difference of the two image histograms.

    Both files are opened and reduced with get_thumbnail() first. 0.0 means
    the histograms are identical; larger values mean less similar. Returns
    -1 when the histograms have different lengths (presumably images of
    different modes), mirroring image_similarity_bands_via_numpy().
    """
    from PIL import Image
    import math

    image1 = get_thumbnail(Image.open(filepath1))
    image2 = get_thumbnail(Image.open(filepath2))

    h1 = image1.histogram()
    h2 = image2.histogram()
    if len(h1) != len(h2):
        return -1

    squared_error = sum((a - b) ** 2 for a, b in zip(h1, h2))
    # float() forces true division so the mean is not truncated on Python 2
    return math.sqrt(squared_error / float(len(h1)))

def image_similarity_vectors_via_numpy(filepath1, filepath2):
    """Return the cosine similarity of the two images' per-pixel mean values.

    Both files are opened and stretched to the get_thumbnail() default size,
    so the two vectors have the same length. Each pixel contributes the
    average of its band values. 1.0 means the normalised vectors are equal.
    """
    # source: http://www.syntacticbayleaves.com/2008/12/03/determining-image-similarity/
    from PIL import Image
    import numpy

    vectors = []
    for filepath in (filepath1, filepath2):
        image = get_thumbnail(Image.open(filepath), stretch_to_fit=True)
        pixels = numpy.array(list(image.getdata()), dtype=float)
        if pixels.ndim > 1:
            # multi-band image: collapse the bands of every pixel to their mean
            pixels = pixels.mean(axis=1)
        vectors.append(pixels)

    a, b = vectors
    a_norm = numpy.linalg.norm(a, 2)
    b_norm = numpy.linalg.norm(b, 2)
    if a_norm == 0 or b_norm == 0:
        # An all-zero image has no direction, so the cosine is undefined.
        # Convention used here: two all-zero images are equal (1.0);
        # otherwise there is no similarity (0.0).
        return 1.0 if a_norm == b_norm else 0.0
    return numpy.dot(a / a_norm, b / b_norm)

def image_similarity_greyscale_hash_code(filepath1, filepath2):
    """Return the hamming distance between the greyscale hash codes of two images.

    Each file is opened, reduced to a greyscale thumbnail and turned into a
    hash string by image_pixel_hash_code(); 0 means the hashes are equal.
    """
    # source: http://blog.safariflow.com/2013/11/26/image-hashing-with-python/
    hash_codes = [
        image_pixel_hash_code(get_thumbnail(Image.open(path), greyscale=True))
        for path in (filepath1, filepath2)
    ]
    # the hashes are compared position by position
    return hamming_distance(hash_codes[0], hash_codes[1])

def image_pixel_hash_code(image):
    """Return an upper-case hex hash built from the pixel values of ``image``.

    Every pixel contributes one bit: '1' when it is below the average pixel
    value, '0' otherwise. The bit string is rendered as hexadecimal and
    zero-padded to cover every bit (at least 16 digits), so images with the
    same pixel count always give hashes of the same length and bit positions
    stay aligned when two hashes are compared.

    ``image`` must provide getdata() returning numeric pixel values
    (presumably a single-band image such as mode "L").
    """
    pixels = list(image.getdata())
    # float() keeps the average exact on Python 2 as well
    avg = sum(pixels) / float(len(pixels))
    bits = "".join('1' if pixel < avg else '0' for pixel in pixels)  # '00010100...'
    # one hex digit per 4 bits; without full padding, leading zero bits were lost
    width = max(16, (len(bits) + 3) // 4)
    # %-formatting is used because the file rebinds the name `format` at module level
    return "%0*X" % (width, int(bits, 2))

def hamming_distance(s1, s2):
    """Count the positions at which ``s1`` and ``s2`` differ.

    The hamming distance is only defined for sequences of equal length, so
    the longer input is cut down to the length of the shorter one first.
    """
    common = min(len(s1), len(s2))
    mismatches = 0
    for left, right in zip(s1[:common], s2[:common]):
        if left != right:
            mismatches += 1
    return mismatches

def get_thumbnail(image, size=(128,128), stretch_to_fit=False, greyscale=False):
    """Return a smaller version of ``image`` - makes comparison much faster/easier.

    size: target size as a (width, height) tuple.
    stretch_to_fit: when true the image is resized to exactly ``size``;
        otherwise image.thumbnail() is applied to the passed image itself.
    greyscale: when true the result is converted to mode "L".
    """
    if stretch_to_fit:
        image = image.resize(size)  # for faster computation
    else:
        # ANTIALIAS was removed from recent Pillow releases, where the same
        # filter is called LANCZOS; old PIL only knows ANTIALIAS.
        resample = getattr(Image, "LANCZOS", None)
        if resample is None:
            resample = Image.ANTIALIAS
        image.thumbnail(size, resample)
    if greyscale:
        image = image.convert("L")  # Convert it to grayscale.
    return image

def mkdir_p_filepath(path):
    """Make sure the directory that will contain the file ``path`` exists."""
    absolute = os.path.abspath(path)
    parent = os.path.dirname(absolute)
    mkdir_p(parent)

def mkdir_p(path):
    """Create ``path`` including missing parents; an existing directory is fine.

    Any other OSError (including ``path`` existing but not being a
    directory) is re-raised.
    """
    import errno
    try:
        os.makedirs(path)
    except OSError as exc:  # Python >2.5
        already_a_directory = exc.errno == errno.EEXIST and os.path.isdir(path)
        if not already_a_directory:
            raise

def get_filename(path):
    """Return the last component of ``path`` for both '/' and '\\' separators.

    When the path ends with a separator, the name of the last directory is
    returned instead.
    """
    # source: http://stackoverflow.com/questions/8384737/python-extract-file-name-from-path-no-matter-what-the-os-path-format
    import ntpath
    last_part = ntpath.basename(path)
    if last_part:
        return last_part
    # trailing separator: fall back to the final directory name
    return ntpath.basename(ntpath.dirname(path))


# Run the sample comparison only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()


Testing


a@t400:~/lab/image-similarity$ python image_similarity.py
2013-12-22 20:28:19,507 - image_similarity.py:32 - image file location: /tmp/imagesimilarity/
2013-12-22 20:28:19,507 - image_similarity.py:55 - ********************
2013-12-22 20:28:19,507 - image_similarity.py:56 - compare images start
2013-12-22 20:28:19,509 - image_similarity.py:59 - image1: 01-elephant.png (http://www.linuxfestnorthwest.org/sites/default/files/sponsors/elephant.png)
2013-12-22 20:28:19,509 - image_similarity.py:62 - image2: 02-elephant.png (http://www.linuxfestnorthwest.org/sites/default/files/sponsors/elephant.png)
2013-12-22 20:28:19,958 - image_similarity.py:69 - image_similarity_bands_via_numpy  => 0 took 448.1 ms
2013-12-22 20:28:20,242 - image_similarity.py:76 - image_similarity_histogram_via_pil => 0.0 took 284.0 ms
2013-12-22 20:28:21,379 - image_similarity.py:83 - image_similarity_vectors_via_numpy => 1.0 took 1136.5 ms
2013-12-22 20:28:21,676 - image_similarity.py:90 - image_similarity_greyscale_hash_code => 0 took 296.9 ms
2013-12-22 20:28:21,676 - image_similarity.py:93 - compare images finished
2013-12-22 20:28:21,676 - image_similarity.py:55 - ********************
2013-12-22 20:28:21,676 - image_similarity.py:56 - compare images start
2013-12-22 20:28:21,677 - image_similarity.py:59 - image1: 03-elephant.png (http://www.linuxfestnorthwest.org/sites/default/files/sponsors/elephant.png)
2013-12-22 20:28:21,677 - image_similarity.py:62 - image2: 04-220px-Postgresql_elephant.svg_.png (http://terminaltwister.com/wp-content/uploads/2013/09/220px-Postgresql_elephant.svg_.png)
2013-12-22 20:28:21,831 - image_similarity.py:69 - image_similarity_bands_via_numpy  => -1 took 153.8 ms
2013-12-22 20:28:21,985 - image_similarity.py:76 - image_similarity_histogram_via_pil => 509.38197848 took 154.5 ms
2013-12-22 20:28:23,094 - image_similarity.py:83 - image_similarity_vectors_via_numpy => 0.809369331383 took 1108.0 ms
2013-12-22 20:28:23,260 - image_similarity.py:90 - image_similarity_greyscale_hash_code => 3207 took 166.5 ms
2013-12-22 20:28:23,261 - image_similarity.py:93 - compare images finished

References

1. Chris Pickett, greyscale hash code: http://blog.safariflow.com/2013/11/26/image-hashing-with-python/
2. AutomatedTester, fortran: http://stackoverflow.com/questions/1927660/compare-two-images-the-python-linux-way/
3. Peter Hansen: http://stackoverflow.com/questions/1819124/image-comparison-algorithm
4. Carlo Cabanilla, http://www.syntacticbayleaves.com/2008/12/03/determining-image-similarity/
5. Attila Oláh, imgcmp.py: https://gist.github.com/attilaolah/1940208