Quickstart

Installation

Install the package from PyPI using pip.

pip install pypair

Confusion Matrix

A confusion matrix is typically used to judge binary classification performance. There are two variables, \(A\) and \(P\), where \(A\) is the actual value (ground truth) and \(P\) is the predicted value. The example below shows how to use the convenience method confusion() and the class ConfusionMatrix to get association measures derived from the confusion matrix.

 1from pypair.association import confusion
 2from pypair.contingency import ConfusionMatrix
 3
 4
 5def get_data():
 6    """
 7    Data taken from `here <https://www.dataschool.io/simple-guide-to-confusion-matrix-terminology/>`_.
 8    A pair of binary variables, `a` and `p`, are returned.
 9
10    :return: a, p
11    """
12    tn = [(0, 0) for _ in range(50)]
13    fp = [(0, 1) for _ in range(10)]
14    fn = [(1, 0) for _ in range(5)]
15    tp = [(1, 1) for _ in range(100)]
16    data = tn + fp + fn + tp
17    a = [a for a, _ in data]
18    p = [b for _, b in data]
19    return a, p
20
21
a, p = get_data()

# if you need to quickly get just one association measure
r = confusion(a, p, measure='acc')
print(r)

print('-' * 15)

# you can also get the list of available association measures
# and loop over it to call confusion(...) once per measure
# this is more convenient, but slower
for m in ConfusionMatrix.measures():
    r = confusion(a, p, m)
    print(f'{r}: {m}')

print('-' * 15)

# if you need multiple association measures, then
# build the confusion matrix table once and query it per measure
# this is less convenient, but much faster
matrix = ConfusionMatrix(a, p)
for m in matrix.measures():
    r = matrix.get(m)
    print(f'{r}: {m}')

Binary-Binary

Association measures for binary-binary variables are computed using binary_binary() or BinaryTable.

 1from pypair.association import binary_binary
 2from pypair.contingency import BinaryTable
 3
 4get_data = lambda x, y, n: [(x, y) for _ in range(n)]
 5data = get_data(1, 1, 207) + get_data(1, 0, 282) + get_data(0, 1, 231) + get_data(0, 0, 242)
 6a = [a for a, _ in data]
 7b = [b for _, b in data]
 8
 9for m in BinaryTable.measures():
10    r = binary_binary(a, b, m)
11    print(f'{r}: {m}')
12
13print('-' * 15)
14
15table = BinaryTable(a, b)
16for m in table.measures():
17    r = table.get(m)
18    print(f'{r}: {m}')

Categorical-Categorical

Association measures for categorical-categorical variables are computed using categorical_categorical() or CategoricalTable.

 1from pypair.association import categorical_categorical
 2from pypair.contingency import CategoricalTable
 3
 4get_data = lambda x, y, n: [(x, y) for _ in range(n)]
 5data = get_data(1, 1, 207) + get_data(1, 0, 282) + get_data(0, 1, 231) + get_data(0, 0, 242)
 6a = [a for a, _ in data]
 7b = [b for _, b in data]
 8
 9for m in CategoricalTable.measures():
10    r = categorical_categorical(a, b, m)
11    print(f'{r}: {m}')
12
13print('-' * 15)
14
15table = CategoricalTable(a, b)
16for m in table.measures():
17    r = table.get(m)
18    print(f'{r}: {m}')

Binary-Continuous

Association measures for binary-continuous variables are computed using binary_continuous() or Biserial.

 1from pypair.association import binary_continuous
 2from pypair.biserial import Biserial
 3
 4get_data = lambda x, y, n: [(x, y) for _ in range(n)]
 5data = get_data(1, 1, 207) + get_data(1, 0, 282) + get_data(0, 1, 231) + get_data(0, 0, 242)
 6a = [a for a, _ in data]
 7b = [b for _, b in data]
 8
 9for m in Biserial.measures():
10    r = binary_continuous(a, b, m)
11    print(f'{r}: {m}')
12
13print('-' * 15)
14
15biserial = Biserial(a, b)
16for m in biserial.measures():
17    r = biserial.get(m)
18    print(f'{r}: {m}')

Ordinal-Ordinal, Concordance

Concordance measures for ordinal-ordinal or continuous-continuous variables are computed using concordance() or Concordance.

 1from pypair.association import concordance
 2from pypair.continuous import Concordance
 3
 4a = [1, 2, 3]
 5b = [3, 2, 1]
 6
 7for m in Concordance.measures():
 8    r = concordance(a, b, m)
 9    print(f'{r}: {m}')
10
11print('-' * 15)
12
13con = Concordance(a, b)
14for m in con.measures():
15    r = con.get(m)
16    print(f'{r}: {m}')

Categorical-Continuous

Categorical-continuous association measures are computed using categorical_continuous() or CorrelationRatio.

 1from pypair.association import categorical_continuous
 2from pypair.continuous import CorrelationRatio
 3
 4data = [
 5    ('a', 45), ('a', 70), ('a', 29), ('a', 15), ('a', 21),
 6    ('g', 40), ('g', 20), ('g', 30), ('g', 42),
 7    ('s', 65), ('s', 95), ('s', 80), ('s', 70), ('s', 85), ('s', 73)
 8]
 9x = [x for x, _ in data]
10y = [y for _, y in data]
11for m in CorrelationRatio.measures():
12    r = categorical_continuous(x, y, m)
13    print(f'{r}: {m}')
14
15print('-' * 15)
16
17cr = CorrelationRatio(x, y)
18for m in cr.measures():
19    r = cr.get(m)
20    print(f'{r}: {m}')

Continuous-Continuous

Association measures for continuous-continuous variables are computed using continuous_continuous() or Continuous.

 1from pypair.association import continuous_continuous
 2from pypair.continuous import Continuous
 3
 4x = [x for x in range(10)]
 5y = [y for y in range(10)]
 6
 7for m in Continuous.measures():
 8    r = continuous_continuous(x, y, m)
 9    print(f'{r}: {m}')
10
11print('-' * 15)
12
13con = Continuous(x, y)
14for m in con.measures():
15    r = con.get(m)
16    print(f'{r}: {m}')

Recipe

Here’s a recipe for using multiprocessing to compute pairwise associations with binary data.

 1import pandas as pd
 2import numpy as np
 3import random
 4from random import randint
 5from pypair.association import binary_binary
 6from itertools import combinations
 7from multiprocessing import Pool
 8
 9np.random.seed(37)
10random.seed(37)
11
def get_data(n_rows=1000, n_cols=5):
    """
    Generates a data frame of random binary values.

    :param n_rows: Number of rows. Default 1000.
    :param n_cols: Number of columns. Default 5.
    :return: Pandas dataframe with columns named ``x0``, ``x1``, ... holding 0 or 1.
    """
    cols = [f'x{i}' for i in range(n_cols)]
    # one tuple per row, each cell drawn independently from {0, 1}
    data = [tuple(randint(0, 1) for _ in range(n_cols)) for _ in range(n_rows)]
    return pd.DataFrame(data, columns=cols)
16
def compute(a, b, df):
    """
    Computes the ``jaccard`` measure between two columns of a data frame.

    Defined at module level, which ``multiprocessing`` needs in order to
    pickle the function for its worker processes.

    :param a: Name of the first column.
    :param b: Name of the second column.
    :param df: Pandas dataframe holding both columns.
    :return: Tuple of the key ``'{a}_{b}'`` and the value returned by ``binary_binary``.
    """
    x = df[a]
    y = df[b]
    return f'{a}_{b}', binary_binary(x, y, measure='jaccard')
21
if __name__ == '__main__':
    df = get_data()

    with Pool(10) as pool:
        # one task per unordered pair of columns; each task receives the data frame
        pairs = ((a, b, df) for a, b in combinations(df.columns, 2))
        bc = pool.starmap(compute, pairs)

    # order the (key, value) results by key before printing them as a dictionary
    bc = sorted(bc, key=lambda tup: tup[0])
    print(dict(bc))

Here’s a nifty utility method to create a correlation matrix. All columns of the input data frame must be of the same type, and you must supply the function that computes the association between two columns. Note that Pandas DataFrame.corr() no longer supports processing non-numeric data; fields that are not numeric will simply be skipped over. Why?

 1from random import randint
 2
 3import pandas as pd
 4
 5from pypair.association import binary_binary
 6from pypair.util import corr
 7
 8
 9def get_data(n_rows=1000, n_cols=5):
10    data = [tuple([randint(0, 1) for _ in range(n_cols)]) for _ in range(n_rows)]
11    cols = [f'x{i}' for i in range(n_cols)]
12    return pd.DataFrame(data, columns=cols)
13
14
if __name__ == '__main__':
    def jaccard(a, b):
        """Association function passed to ``corr``: the ``jaccard`` measure."""
        return binary_binary(a, b, measure='jaccard')

    def tanimoto(a, b):
        """Association function passed to ``corr``: the ``tanimoto_i`` measure."""
        return binary_binary(a, b, measure='tanimoto_i')

    df = get_data()
    jaccard_corr = corr(df, jaccard)
    tanimoto_corr = corr(df, tanimoto)

    print(jaccard_corr)
    print('-' * 15)
    print(tanimoto_corr)

Apache Spark

Spark is supported for some of the association measures. Active support is appreciated. Below are some code samples to get you started.

  1import json
  2from random import choice
  3
  4import pandas as pd
  5from pyspark.sql import SparkSession
  6
  7from pypair.spark import binary_binary, confusion, categorical_categorical, agreement, binary_continuous, concordance, \
  8    categorical_continuous, continuous_continuous
  9
 10
 11def _get_binary_binary_data(spark):
 12    """
 13    Gets dummy binary-binary data in a Spark dataframe.
 14
 15    :return: Spark dataframe.
 16    """
 17    get_data = lambda x, y, n: [(x, y) * 2 for _ in range(n)]
 18    data = get_data(1, 1, 207) + get_data(1, 0, 282) + get_data(0, 1, 231) + get_data(0, 0, 242)
 19    pdf = pd.DataFrame(data, columns=['x1', 'x2', 'x3', 'x4'])
 20    sdf = spark.createDataFrame(pdf)
 21    return sdf
 22
 23
 24def _get_confusion_data(spark):
 25    """
 26    Gets dummy binary-binary data in Spark dataframe. For use with confusion matrix analysis.
 27
 28    :return: Spark dataframe.
 29    """
 30    tn = [(0, 0) * 2 for _ in range(50)]
 31    fp = [(0, 1) * 2 for _ in range(10)]
 32    fn = [(1, 0) * 2 for _ in range(5)]
 33    tp = [(1, 1) * 2 for _ in range(100)]
 34    data = tn + fp + fn + tp
 35    pdf = pd.DataFrame(data, columns=['x1', 'x2', 'x3', 'x4'])
 36    sdf = spark.createDataFrame(pdf)
 37    return sdf
 38
 39
 40def _get_categorical_categorical_data(spark):
 41    """
 42    Gets dummy categorical-categorical data in Spark dataframe.
 43
 44    :return: Spark dataframe.
 45    """
 46    x_domain = ['a', 'b', 'c']
 47    y_domain = ['a', 'b']
 48
 49    get_x = lambda: choice(x_domain)
 50    get_y = lambda: choice(y_domain)
 51    get_data = lambda: {f'x{i}': v for i, v in enumerate((get_x(), get_y(), get_x(), get_y()))}
 52
 53    pdf = pd.DataFrame([get_data() for _ in range(100)])
 54    sdf = spark.createDataFrame(pdf)
 55    return sdf
 56
 57
 58def _get_binary_continuous_data(spark):
 59    """
 60    Gets dummy `binary-continuous data <https://www.slideshare.net/MuhammadKhalil66/point-biserial-correlation-example>`_.
 61
 62    :return: Spark dataframe.
 63    """
 64    data = [
 65        (1, 10), (1, 11), (1, 6), (1, 11), (0, 4),
 66        (0, 3), (1, 12), (0, 2), (0, 2), (0, 1)
 67    ]
 68    pdf = pd.DataFrame(data, columns=['gender', 'years'])
 69    sdf = spark.createDataFrame(pdf)
 70    return sdf
 71
 72
 73def _get_concordance_data(spark):
 74    """
 75    Gets dummy concordance data.
 76
 77    :return: Spark dataframe.
 78    """
 79    a = [1, 2, 3]
 80    b = [3, 2, 1]
 81    pdf = pd.DataFrame({'a': a, 'b': b, 'c': a, 'd': b})
 82    sdf = spark.createDataFrame(pdf)
 83    return sdf
 84
 85
 86def _get_categorical_continuous_data(spark):
 87    data = [
 88        ('a', 45), ('a', 70), ('a', 29), ('a', 15), ('a', 21),
 89        ('g', 40), ('g', 20), ('g', 30), ('g', 42),
 90        ('s', 65), ('s', 95), ('s', 80), ('s', 70), ('s', 85), ('s', 73)
 91    ]
 92    data = [tup * 2 for tup in data]
 93    pdf = pd.DataFrame(data, columns=['x1', 'x2', 'x3', 'x4'])
 94    sdf = spark.createDataFrame(pdf)
 95    return sdf
 96
 97
 98def _get_continuous_continuous_data(spark):
 99    """
100    Gets dummy continuous-continuous data.
101    See `site <http://onlinestatbook.com/2/describing_bivariate_data/calculation.html>`_.
102
103    :return: Spark dataframe.
104    """
105    data = [
106        (12, 9),
107        (10, 12),
108        (9, 12),
109        (14, 11),
110        (10, 8),
111        (11, 9),
112        (10, 9),
113        (10, 6),
114        (14, 12),
115        (9, 11),
116        (11, 12),
117        (10, 7),
118        (11, 13),
119        (15, 14),
120        (8, 11),
121        (11, 11),
122        (9, 8),
123        (9, 9),
124        (10, 11),
125        (12, 9),
126        (11, 12),
127        (10, 12),
128        (9, 7),
129        (7, 9),
130        (12, 14)
131    ]
132    pdf = pd.DataFrame([item * 2 for item in data], columns=['x1', 'x2', 'x3', 'x4'])
133    sdf = spark.createDataFrame(pdf)
134    return sdf
135
136
spark = None

try:
    # create a spark session
    spark = (SparkSession.builder
             .master('local[4]')
             .appName('local-testing-pyspark')
             .getOrCreate())

    # create some spark dataframes
    bin_sdf = _get_binary_binary_data(spark)
    con_sdf = _get_confusion_data(spark)
    cat_sdf = _get_categorical_categorical_data(spark)
    bcn_sdf = _get_binary_continuous_data(spark)
    crd_sdf = _get_concordance_data(spark)
    ccn_sdf = _get_categorical_continuous_data(spark)
    cnt_sdf = _get_continuous_continuous_data(spark)

    # call these methods to get the association measures
    bin_results = binary_binary(bin_sdf).collect()
    con_results = confusion(con_sdf).collect()
    cat_results = categorical_categorical(cat_sdf).collect()
    agr_results = agreement(bin_sdf).collect()
    bcn_results = binary_continuous(bcn_sdf, binary=['gender'], continuous=['years']).collect()
    crd_results = concordance(crd_sdf).collect()
    ccn_results = categorical_continuous(ccn_sdf, ['x1', 'x3'], ['x2', 'x4']).collect()
    cnt_results = continuous_continuous(cnt_sdf).collect()

    # convert the lists to dictionaries (first item of each record -> second item)
    bin_results = {tup[0]: tup[1] for tup in bin_results}
    con_results = {tup[0]: tup[1] for tup in con_results}
    cat_results = {tup[0]: tup[1] for tup in cat_results}
    agr_results = {tup[0]: tup[1] for tup in agr_results}
    bcn_results = {tup[0]: tup[1] for tup in bcn_results}
    crd_results = {tup[0]: tup[1] for tup in crd_results}
    ccn_results = {tup[0]: tup[1] for tup in ccn_results}
    cnt_results = {tup[0]: tup[1] for tup in cnt_results}

    # pretty print
    def to_json(r):
        """Dumps a result dictionary as JSON, joining the two parts of each key with ``_``."""
        return json.dumps({f'{k[0]}_{k[1]}': v for k, v in r.items()}, indent=1)

    print(to_json(bin_results))
    print('-' * 10)
    print(to_json(con_results))
    print('*' * 10)
    print(to_json(cat_results))
    print('~' * 10)
    print(to_json(agr_results))
    print('-' * 10)
    print(to_json(bcn_results))
    print('=' * 10)
    print(to_json(crd_results))
    print('`' * 10)
    print(to_json(ccn_results))
    print('/' * 10)
    print(to_json(cnt_results))
except Exception as e:
    # demo script: report the failure rather than raising
    print(e)
finally:
    # only stop a session that was actually created; if session creation failed
    # there is nothing to close
    if spark is not None:
        try:
            spark.stop()
            print('closed spark')
        except Exception as e:
            print(e)