Quickstart

Installation

Use uv with Python 3.13 to create a reproducible development environment.

uv venv --python 3.13
source .venv/bin/activate
uv sync

Confusion Matrix

A confusion matrix is typically used to judge binary classification performance. There are two variables, \(A\) and \(P\), where \(A\) is the actual value (ground truth) and \(P\) is the predicted value. The example below shows how to use the convenience function confusion() and the class ConfusionMatrix to get association measures derived from the confusion matrix.

 1from pypair.association import confusion
 2from pypair.contingency import ConfusionMatrix
 3
 4
 5def get_data():
 6    """
 7    Data taken from `here <https://www.dataschool.io/simple-guide-to-confusion-matrix-terminology/>`_.
 8    A pair of binary variables, `a` and `p`, are returned.
 9
10    :return: a, p
11    """
12    tn = [(0, 0) for _ in range(50)]
13    fp = [(0, 1) for _ in range(10)]
14    fn = [(1, 0) for _ in range(5)]
15    tp = [(1, 1) for _ in range(100)]
16    data = tn + fp + fn + tp
17    a = [a for a, _ in data]
18    p = [b for _, b in data]
19    return a, p
20
21
22a, p = get_data()
23
24# if you need to quickly get just one association measure
25r = confusion(a, p, measure="acc")
26print(r)
27
28print("-" * 15)
29
30# you can also get a list of available association measures
31# and loop over to call confusion(...)
32# this is more convenient, but less fast
33for m in ConfusionMatrix.measures():
34    r = confusion(a, p, m)
35    print(f"{r}: {m}")
36
37print("-" * 15)
38
39# if you need multiple association measures, then
40# build the confusion matrix table
41# this is less convenient, but much faster
42matrix = ConfusionMatrix(a, p)
43for m in matrix.measures():
44    r = matrix.get(m)
45    print(f"{r}: {m}")

Binary-Binary

Association measures for binary-binary variables are computed using binary_binary() or BinaryTable.

 1from pypair.association import binary_binary
 2from pypair.contingency import BinaryTable
 3
 4
 5def get_data(x, y, n):
 6    return [(x, y) for _ in range(n)]
 7
 8
 9data = get_data(1, 1, 207) + get_data(1, 0, 282) + get_data(0, 1, 231) + get_data(0, 0, 242)
10a = [a for a, _ in data]
11b = [b for _, b in data]
12
13for m in BinaryTable.measures():
14    r = binary_binary(a, b, m)
15    print(f"{r}: {m}")
16
17print("-" * 15)
18
19table = BinaryTable(a, b)
20for m in table.measures():
21    r = table.get(m)
22    print(f"{r}: {m}")

Categorical-Categorical

Association measures for categorical-categorical variables are computed using categorical_categorical() or CategoricalTable.

 1from pypair.association import categorical_categorical
 2from pypair.contingency import CategoricalTable
 3
 4
 5def get_data(x, y, n):
 6    return [(x, y) for _ in range(n)]
 7
 8
 9data = get_data(1, 1, 207) + get_data(1, 0, 282) + get_data(0, 1, 231) + get_data(0, 0, 242)
10a = [a for a, _ in data]
11b = [b for _, b in data]
12
13for m in CategoricalTable.measures():
14    r = categorical_categorical(a, b, m)
15    print(f"{r}: {m}")
16
17print("-" * 15)
18
19table = CategoricalTable(a, b)
20for m in table.measures():
21    r = table.get(m)
22    print(f"{r}: {m}")

Binary-Continuous

Association measures for binary-continuous variables are computed using binary_continuous() or Biserial.

 1from pypair.association import binary_continuous
 2from pypair.biserial import Biserial
 3
 4
 5def get_data(x, y, n):
 6    return [(x, y) for _ in range(n)]
 7
 8
 9data = get_data(1, 1, 207) + get_data(1, 0, 282) + get_data(0, 1, 231) + get_data(0, 0, 242)
10a = [a for a, _ in data]
11b = [b for _, b in data]
12
13for m in Biserial.measures():
14    r = binary_continuous(a, b, m)
15    print(f"{r}: {m}")
16
17print("-" * 15)
18
19biserial = Biserial(a, b)
20for m in biserial.measures():
21    r = biserial.get(m)
22    print(f"{r}: {m}")

Ordinal-Ordinal, Concordance

Concordance measures for ordinal-ordinal or continuous-continuous variables are computed using concordance() or Concordance.

 1from pypair.association import concordance
 2from pypair.continuous import Concordance
 3
 4a = [1, 2, 3]
 5b = [3, 2, 1]
 6
 7for m in Concordance.measures():
 8    r = concordance(a, b, m)
 9    print(f"{r}: {m}")
10
11print("-" * 15)
12
13con = Concordance(a, b)
14for m in con.measures():
15    r = con.get(m)
16    print(f"{r}: {m}")

Categorical-Continuous

Categorical-continuous association measures are computed using categorical_continuous() or CorrelationRatio.

 1from pypair.association import categorical_continuous
 2from pypair.continuous import CorrelationRatio
 3
 4data = [
 5    ("a", 45),
 6    ("a", 70),
 7    ("a", 29),
 8    ("a", 15),
 9    ("a", 21),
10    ("g", 40),
11    ("g", 20),
12    ("g", 30),
13    ("g", 42),
14    ("s", 65),
15    ("s", 95),
16    ("s", 80),
17    ("s", 70),
18    ("s", 85),
19    ("s", 73),
20]
21x = [x for x, _ in data]
22y = [y for _, y in data]
23for m in CorrelationRatio.measures():
24    r = categorical_continuous(x, y, m)
25    print(f"{r}: {m}")
26
27print("-" * 15)
28
29cr = CorrelationRatio(x, y)
30for m in cr.measures():
31    r = cr.get(m)
32    print(f"{r}: {m}")

Continuous-Continuous

Association measures for continuous-continuous variables are computed using continuous_continuous() or Continuous.

 1from pypair.association import continuous_continuous
 2from pypair.continuous import Continuous
 3
 4x = [x for x in range(10)]
 5y = [y for y in range(10)]
 6
 7for m in Continuous.measures():
 8    r = continuous_continuous(x, y, m)
 9    print(f"{r}: {m}")
10
11print("-" * 15)
12
13con = Continuous(x, y)
14for m in con.measures():
15    r = con.get(m)
16    print(f"{r}: {m}")

Recipe

Here’s a recipe for using multiprocessing to compute pairwise associations on binary data.

 1import pandas as pd
 2import numpy as np
 3import random
 4from random import randint
 5from pypair.association import binary_binary
 6from itertools import combinations
 7from multiprocessing import Pool
 8
 9np.random.seed(37)
10random.seed(37)
11
12
13def get_data(n_rows=1000, n_cols=5):
14    data = [tuple([randint(0, 1) for _ in range(n_cols)]) for _ in range(n_rows)]
15    cols = [f"x{i}" for i in range(n_cols)]
16    return pd.DataFrame(data, columns=cols)
17
18
19def compute(a, b, df):
20    x = df[a]
21    y = df[b]
22    return f"{a}_{b}", binary_binary(x, y, measure="jaccard")
23
24
25if __name__ == "__main__":
26    df = get_data()
27
28    with Pool(10) as pool:
29        pairs = ((a, b, df) for a, b in combinations(df.columns, 2))
30        bc = pool.starmap(compute, pairs)
31
32    bc = sorted(bc, key=lambda tup: tup[0])
33    print(dict(bc))

Here’s a nifty utility function for creating a correlation matrix. All columns of the input data frame must be of the same variable type, and you must supply the function that computes the association between a pair of columns. Note that Pandas DataFrame.corr() no longer supports processing non-numeric data; fields that are not numeric are simply skipped over. Why?

 1from random import randint
 2
 3import pandas as pd
 4
 5from pypair.association import binary_binary
 6from pypair.util import corr
 7
 8
 9def get_data(n_rows=1000, n_cols=5):
10    data = [tuple([randint(0, 1) for _ in range(n_cols)]) for _ in range(n_rows)]
11    cols = [f"x{i}" for i in range(n_cols)]
12    return pd.DataFrame(data, columns=cols)
13
14
15if __name__ == "__main__":
16
17    def jaccard(a, b):
18        return binary_binary(a, b, measure="jaccard")
19
20    def tanimoto(a, b):
21        return binary_binary(a, b, measure="tanimoto_i")
22
23    df = get_data()
24    jaccard_corr = corr(df, jaccard)
25    tanimoto_corr = corr(df, tanimoto)
26
27    print(jaccard_corr)
28    print("-" * 15)
29    print(tanimoto_corr)

Apache Spark

Spark is supported for some of the association measures. Active support is appreciated. Below are some code samples to get you started.

  1import json
  2from random import choice
  3
  4import pandas as pd
  5from pyspark.sql import SparkSession
  6
  7from pypair.spark import (
  8    binary_binary,
  9    confusion,
 10    categorical_categorical,
 11    agreement,
 12    binary_continuous,
 13    concordance,
 14    categorical_continuous,
 15    continuous_continuous,
 16)
 17
 18
 19def _get_binary_binary_data(spark):
 20    """
 21    Gets dummy binary-binary data in a Spark dataframe.
 22
 23    :return: Spark dataframe.
 24    """
 25
 26    def get_data(x, y, n):
 27        return [(x, y) * 2 for _ in range(n)]
 28
 29    data = get_data(1, 1, 207) + get_data(1, 0, 282) + get_data(0, 1, 231) + get_data(0, 0, 242)
 30    pdf = pd.DataFrame(data, columns=["x1", "x2", "x3", "x4"])
 31    sdf = spark.createDataFrame(pdf)
 32    return sdf
 33
 34
 35def _get_confusion_data(spark):
 36    """
 37    Gets dummy binary-binary data in Spark dataframe. For use with confusion matrix analysis.
 38
 39    :return: Spark dataframe.
 40    """
 41    tn = [(0, 0) * 2 for _ in range(50)]
 42    fp = [(0, 1) * 2 for _ in range(10)]
 43    fn = [(1, 0) * 2 for _ in range(5)]
 44    tp = [(1, 1) * 2 for _ in range(100)]
 45    data = tn + fp + fn + tp
 46    pdf = pd.DataFrame(data, columns=["x1", "x2", "x3", "x4"])
 47    sdf = spark.createDataFrame(pdf)
 48    return sdf
 49
 50
 51def _get_categorical_categorical_data(spark):
 52    """
 53    Gets dummy categorical-categorical data in Spark dataframe.
 54
 55    :return: Spark dataframe.
 56    """
 57    x_domain = ["a", "b", "c"]
 58    y_domain = ["a", "b"]
 59
 60    def get_x():
 61        return choice(x_domain)
 62
 63    def get_y():
 64        return choice(y_domain)
 65
 66    def get_data():
 67        return {f"x{i}": v for i, v in enumerate((get_x(), get_y(), get_x(), get_y()))}
 68
 69    pdf = pd.DataFrame([get_data() for _ in range(100)])
 70    sdf = spark.createDataFrame(pdf)
 71    return sdf
 72
 73
 74def _get_binary_continuous_data(spark):
 75    """
 76    Gets dummy `binary-continuous data <https://www.slideshare.net/MuhammadKhalil66/point-biserial-correlation-example>`_.
 77
 78    :return: Spark dataframe.
 79    """
 80    data = [(1, 10), (1, 11), (1, 6), (1, 11), (0, 4), (0, 3), (1, 12), (0, 2), (0, 2), (0, 1)]
 81    pdf = pd.DataFrame(data, columns=["gender", "years"])
 82    sdf = spark.createDataFrame(pdf)
 83    return sdf
 84
 85
 86def _get_concordance_data(spark):
 87    """
 88    Gets dummy concordance data.
 89
 90    :return: Spark dataframe.
 91    """
 92    a = [1, 2, 3]
 93    b = [3, 2, 1]
 94    pdf = pd.DataFrame({"a": a, "b": b, "c": a, "d": b})
 95    sdf = spark.createDataFrame(pdf)
 96    return sdf
 97
 98
 99def _get_categorical_continuous_data(spark):
100    data = [
101        ("a", 45),
102        ("a", 70),
103        ("a", 29),
104        ("a", 15),
105        ("a", 21),
106        ("g", 40),
107        ("g", 20),
108        ("g", 30),
109        ("g", 42),
110        ("s", 65),
111        ("s", 95),
112        ("s", 80),
113        ("s", 70),
114        ("s", 85),
115        ("s", 73),
116    ]
117    data = [tup * 2 for tup in data]
118    pdf = pd.DataFrame(data, columns=["x1", "x2", "x3", "x4"])
119    sdf = spark.createDataFrame(pdf)
120    return sdf
121
122
123def _get_continuous_continuous_data(spark):
124    """
125    Gets dummy continuous-continuous data.
126    See `site <http://onlinestatbook.com/2/describing_bivariate_data/calculation.html>`_.
127
128    :return: Spark dataframe.
129    """
130    data = [
131        (12, 9),
132        (10, 12),
133        (9, 12),
134        (14, 11),
135        (10, 8),
136        (11, 9),
137        (10, 9),
138        (10, 6),
139        (14, 12),
140        (9, 11),
141        (11, 12),
142        (10, 7),
143        (11, 13),
144        (15, 14),
145        (8, 11),
146        (11, 11),
147        (9, 8),
148        (9, 9),
149        (10, 11),
150        (12, 9),
151        (11, 12),
152        (10, 12),
153        (9, 7),
154        (7, 9),
155        (12, 14),
156    ]
157    pdf = pd.DataFrame([item * 2 for item in data], columns=["x1", "x2", "x3", "x4"])
158    sdf = spark.createDataFrame(pdf)
159    return sdf
160
161
162spark = None
163
164try:
165    # create a spark session
166    spark = SparkSession.builder.master("local[4]").appName("local-testing-pyspark").getOrCreate()
167
168    # create some spark dataframes
169    bin_sdf = _get_binary_binary_data(spark)
170    con_sdf = _get_confusion_data(spark)
171    cat_sdf = _get_categorical_categorical_data(spark)
172    bcn_sdf = _get_binary_continuous_data(spark)
173    crd_sdf = _get_concordance_data(spark)
174    ccn_sdf = _get_categorical_continuous_data(spark)
175    cnt_sdf = _get_continuous_continuous_data(spark)
176
177    # call these methods to get the association measures
178    bin_results = binary_binary(bin_sdf).collect()
179    con_results = confusion(con_sdf).collect()
180    cat_results = categorical_categorical(cat_sdf).collect()
181    agr_results = agreement(bin_sdf).collect()
182    bcn_results = binary_continuous(bcn_sdf, binary=["gender"], continuous=["years"]).collect()
183    crd_results = concordance(crd_sdf).collect()
184    ccn_results = categorical_continuous(ccn_sdf, ["x1", "x3"], ["x2", "x4"]).collect()
185    cnt_results = continuous_continuous(cnt_sdf).collect()
186
187    # convert the lists to dictionaries
188    bin_results = {tup[0]: tup[1] for tup in bin_results}
189    con_results = {tup[0]: tup[1] for tup in con_results}
190    cat_results = {tup[0]: tup[1] for tup in cat_results}
191    agr_results = {tup[0]: tup[1] for tup in agr_results}
192    bcn_results = {tup[0]: tup[1] for tup in bcn_results}
193    crd_results = {tup[0]: tup[1] for tup in crd_results}
194    ccn_results = {tup[0]: tup[1] for tup in ccn_results}
195    cnt_results = {tup[0]: tup[1] for tup in cnt_results}
196
197    # pretty print
198    def to_json(results):
199        return json.dumps({f"{k[0]}_{k[1]}": v for k, v in results.items()}, indent=1)
200
201    print(to_json(bin_results))
202    print("-" * 10)
203    print(to_json(con_results))
204    print("*" * 10)
205    print(to_json(cat_results))
206    print("~" * 10)
207    print(to_json(agr_results))
208    print("-" * 10)
209    print(to_json(bcn_results))
210    print("=" * 10)
211    print(to_json(crd_results))
212    print("`" * 10)
213    print(to_json(ccn_results))
214    print("/" * 10)
215    print(to_json(cnt_results))
216except Exception as e:
217    print(e)
218finally:
219    try:
220        spark.stop()
221        print("closed spark")
222    except Exception as e:
223        print(e)