Datasaurus Dozen Particle Distributions

The goal of this short demo is to explore the Datasaurus Dozen, a collection of datasets famous for having very different shapes but nearly identical summary statistics, using a technique from the analysis of particle distributions: the pair distribution function. The pair distribution function is invariant under rotation and translation, whereas most of the summary statistics computed for these datasets are not, so it is interesting to see which datasets produce similar pair distribution plots.

Pair Distribution Plot

import pandas
from IPython.display import display
from ggplot import *
from math import hypot, atan2
import numpy as np

# Read the tab-separated Datasaurus table, then split it into one DataFrame
# per distinct value of its 'dataset' column, keyed by that value.
datasaurus = pandas.read_csv('DatasaurusDozen.tsv', sep='\t')
data_dict = {name: group for name, group in datasaurus.groupby('dataset')}

# Bare expression: in a notebook cell this echoes the mapping for inspection.
data_dict

# Draw one x/y scatter plot per dataset, showing the dataset name above it.
for title, frame in data_dict.items():
    scatter = ggplot(aes(x='x', y='y'), data=frame) + geom_point()

    display(title)
    display(scatter)

'away'

png

'bullseye'

png

'circle'

png

'dino'

png

'dots'

png

'h_lines'

png

'high_lines'

png

'slant_down'

png

'slant_up'

png

'star'

png

'v_lines'

png

'wide_lines'

png

'x_shape'

png

XY Pairs

# For each dataset, histogram the Euclidean distance between every ordered
# pair of its points (1-unit-wide bins) and show it under the dataset name.
for title, frame in data_dict.items():

    # Convert the coordinate columns to plain Python lists once per dataset;
    # they do not change while the pairs are enumerated.
    points = list(zip(frame.x.tolist(), frame.y.tolist()))

    # Every ordered pair is visited, so each point is also paired with itself
    # (contributing r = 0) and each distinct pair is counted in both orders.
    length = [
        hypot(x1 - x0, y1 - y0)
        for x0, y0 in points
        for x1, y1 in points
    ]

    data = pandas.DataFrame({'length': length})
    plot = ggplot(aes(x='length'), data=data) + geom_histogram(binwidth=1)
    display(title)
    display(plot)

    

'away'

png

'bullseye'

png

'circle'

png

'dino'

png

'dots'

png

'h_lines'

png

'high_lines'

png

'slant_down'

png

'slant_up'

png

'star'

png

'v_lines'

png

'wide_lines'

png

'x_shape'

png