Quickstart
Installation
Use uv with Python 3.13 to create a reproducible development environment.
uv venv --python 3.13
source .venv/bin/activate
uv sync
Confusion Matrix
A confusion matrix is typically used to judge binary classification performance. There are two variables, \(A\) and \(P\), where \(A\) is the actual value (ground truth) and \(P\) is the predicted value. The example below shows how to use the convenience method confusion() and the class ConfusionMatrix to get association measures derived from the confusion matrix.
1from pypair.association import confusion
2from pypair.contingency import ConfusionMatrix
3
4
def get_data():
    """
    Data taken from `here <https://www.dataschool.io/simple-guide-to-confusion-matrix-terminology/>`_.
    A pair of binary variables, `a` and `p`, are returned.
    Each observation is a tuple `(actual, predicted)`.

    :return: a, p
    """
    tn = [(0, 0)] * 50
    fp = [(0, 1)] * 10
    fn = [(1, 0)] * 5
    tp = [(1, 1)] * 100
    data = tn + fp + fn + tp
    # split the pairs into two parallel lists
    a = [actual for actual, _ in data]
    p = [predicted for _, predicted in data]
    return a, p
20
21
a, p = get_data()

# if you need to quickly get just one association measure
r = confusion(a, p, measure="acc")
print(r)

print("-" * 15)

# you can also get a list of available association measures
# and loop over them to call confusion(...)
# this is more convenient, but slower
for m in ConfusionMatrix.measures():
    r = confusion(a, p, m)
    print(f"{r}: {m}")

print("-" * 15)

# if you need multiple association measures, then
# build the confusion matrix table
# this is less convenient, but much faster
matrix = ConfusionMatrix(a, p)
for m in matrix.measures():
    r = matrix.get(m)
    print(f"{r}: {m}")
Binary-Binary
Association measures for binary-binary variables are computed using binary_binary() or BinaryTable.
1from pypair.association import binary_binary
2from pypair.contingency import BinaryTable
3
4
def get_data(x, y, n):
    """
    Builds `n` identical observations of the pair `(x, y)`.

    :param x: Value of the first variable.
    :param y: Value of the second variable.
    :param n: Number of times the pair is repeated.
    :return: List of `n` tuples `(x, y)`.
    """
    return [(x, y) for _ in range(n)]
7
8
# cell counts of the 2x2 table: (1, 1), (1, 0), (0, 1), (0, 0)
data = get_data(1, 1, 207) + get_data(1, 0, 282) + get_data(0, 1, 231) + get_data(0, 0, 242)
a = [first for first, _ in data]
b = [second for _, second in data]

# compute each measure through the convenience function
for m in BinaryTable.measures():
    r = binary_binary(a, b, m)
    print(f"{r}: {m}")

print("-" * 15)

# build the table once, then query each measure from it
table = BinaryTable(a, b)
for m in table.measures():
    r = table.get(m)
    print(f"{r}: {m}")
Categorical-Categorical
Association measures for categorical-categorical variables are computed using categorical_categorical() or CategoricalTable.
1from pypair.association import categorical_categorical
2from pypair.contingency import CategoricalTable
3
4
def get_data(x, y, n):
    """
    Builds `n` identical observations of the pair `(x, y)`.

    :param x: Value of the first variable.
    :param y: Value of the second variable.
    :param n: Number of times the pair is repeated.
    :return: List of `n` tuples `(x, y)`.
    """
    return [(x, y) for _ in range(n)]
7
8
# observation counts per category pair: (1, 1), (1, 0), (0, 1), (0, 0)
data = get_data(1, 1, 207) + get_data(1, 0, 282) + get_data(0, 1, 231) + get_data(0, 0, 242)
a = [first for first, _ in data]
b = [second for _, second in data]

# compute each measure through the convenience function
for m in CategoricalTable.measures():
    r = categorical_categorical(a, b, m)
    print(f"{r}: {m}")

print("-" * 15)

# build the table once, then query each measure from it
table = CategoricalTable(a, b)
for m in table.measures():
    r = table.get(m)
    print(f"{r}: {m}")
Binary-Continuous
Association measures for binary-continuous variables are computed using binary_continuous() or Biserial.
1from pypair.association import binary_continuous
2from pypair.biserial import Biserial
3
4
def get_data(x, y, n):
    """
    Builds `n` identical observations of the pair `(x, y)`.

    :param x: Value of the first variable.
    :param y: Value of the second variable.
    :param n: Number of times the pair is repeated.
    :return: List of `n` tuples `(x, y)`.
    """
    return [(x, y) for _ in range(n)]
7
8
# NOTE: both variables only take the values 0 and 1 in this sample;
# `b` stands in for the continuous variable
data = get_data(1, 1, 207) + get_data(1, 0, 282) + get_data(0, 1, 231) + get_data(0, 0, 242)
a = [first for first, _ in data]
b = [second for _, second in data]

# compute each measure through the convenience function
for m in Biserial.measures():
    r = binary_continuous(a, b, m)
    print(f"{r}: {m}")

print("-" * 15)

# build the object once, then query each measure from it
biserial = Biserial(a, b)
for m in biserial.measures():
    r = biserial.get(m)
    print(f"{r}: {m}")
Ordinal-Ordinal, Concordance
Concordance measures for ordinal-ordinal or continuous-continuous variables are computed using concordance() or Concordance.
1from pypair.association import concordance
2from pypair.continuous import Concordance
3
# two rankings in exactly opposite order
a = [1, 2, 3]
b = [3, 2, 1]

# compute each measure through the convenience function
for m in Concordance.measures():
    r = concordance(a, b, m)
    print(f"{r}: {m}")

print("-" * 15)

# build the object once, then query each measure from it
con = Concordance(a, b)
for m in con.measures():
    r = con.get(m)
    print(f"{r}: {m}")
Categorical-Continuous
Categorical-continuous association measures are computed using categorical_continuous() or CorrelationRatio.
1from pypair.association import categorical_continuous
2from pypair.continuous import CorrelationRatio
3
# (category, value) observations
data = [
    ("a", 45),
    ("a", 70),
    ("a", 29),
    ("a", 15),
    ("a", 21),
    ("g", 40),
    ("g", 20),
    ("g", 30),
    ("g", 42),
    ("s", 65),
    ("s", 95),
    ("s", 80),
    ("s", 70),
    ("s", 85),
    ("s", 73),
]
x = [category for category, _ in data]
y = [value for _, value in data]

# compute each measure through the convenience function
for m in CorrelationRatio.measures():
    r = categorical_continuous(x, y, m)
    print(f"{r}: {m}")

print("-" * 15)

# build the object once, then query each measure from it
cr = CorrelationRatio(x, y)
for m in cr.measures():
    r = cr.get(m)
    print(f"{r}: {m}")
Continuous-Continuous
Association measures for continuous-continuous variables are computed using continuous_continuous() or Continuous.
1from pypair.association import continuous_continuous
2from pypair.continuous import Continuous
3
# two identical sequences 0..9
x = list(range(10))
y = list(range(10))

# compute each measure through the convenience function
for m in Continuous.measures():
    r = continuous_continuous(x, y, m)
    print(f"{r}: {m}")

print("-" * 15)

# build the object once, then query each measure from it
con = Continuous(x, y)
for m in con.measures():
    r = con.get(m)
    print(f"{r}: {m}")
Recipe
Here’s a recipe for using multiprocessing to compute pairwise associations on binary data.
1import pandas as pd
2import numpy as np
3import random
4from random import randint
5from pypair.association import binary_binary
6from itertools import combinations
7from multiprocessing import Pool
8
# seed both the NumPy and the standard-library generators so runs are reproducible
np.random.seed(37)
random.seed(37)
11
12
def get_data(n_rows=1000, n_cols=5):
    """
    Generates a data frame of random binary values.

    :param n_rows: Number of rows.
    :param n_cols: Number of columns.
    :return: Pandas dataframe with columns named `x0`, `x1`, ... holding 0 or 1.
    """
    cols = [f"x{i}" for i in range(n_cols)]
    data = [tuple(randint(0, 1) for _ in range(n_cols)) for _ in range(n_rows)]
    return pd.DataFrame(data, columns=cols)
17
18
def compute(a, b, df):
    """
    Computes the Jaccard association between two columns of a data frame.

    :param a: Name of the first column.
    :param b: Name of the second column.
    :param df: Data frame holding both columns.
    :return: Tuple of the key `{a}_{b}` and the value returned by `binary_binary`.
    """
    x = df[a]
    y = df[b]
    return f"{a}_{b}", binary_binary(x, y, measure="jaccard")
23
24
if __name__ == "__main__":
    df = get_data()

    with Pool(10) as pool:
        # one task per unordered pair of columns; each task also receives the data frame
        pairs = ((a, b, df) for a, b in combinations(df.columns, 2))
        bc = pool.starmap(compute, pairs)

    # order the (key, value) results by key before printing them as a dictionary
    bc = sorted(bc, key=lambda tup: tup[0])
    print(dict(bc))
Here’s a nifty utility method to create a correlation matrix. Every column of the input data frame must be of the same type, and you must supply the function that computes the association between two columns. Note that Pandas DataFrame.corr() no longer supports processing non-numeric data; fields that are not numeric are simply skipped over. Why?
1from random import randint
2
3import pandas as pd
4
5from pypair.association import binary_binary
6from pypair.util import corr
7
8
def get_data(n_rows=1000, n_cols=5):
    """
    Generates a data frame of random binary values.

    :param n_rows: Number of rows.
    :param n_cols: Number of columns.
    :return: Pandas dataframe with columns named `x0`, `x1`, ... holding 0 or 1.
    """
    cols = [f"x{i}" for i in range(n_cols)]
    data = [tuple(randint(0, 1) for _ in range(n_cols)) for _ in range(n_rows)]
    return pd.DataFrame(data, columns=cols)
13
14
if __name__ == "__main__":

    # pairwise functions handed to corr(); each takes two columns
    def jaccard(a, b):
        return binary_binary(a, b, measure="jaccard")

    def tanimoto(a, b):
        return binary_binary(a, b, measure="tanimoto_i")

    df = get_data()
    jaccard_corr = corr(df, jaccard)
    tanimoto_corr = corr(df, tanimoto)

    print(jaccard_corr)
    print("-" * 15)
    print(tanimoto_corr)
Apache Spark
Spark is supported for some of the association measures. Active support is appreciated. Below are some code samples to get you started.
1import json
2from random import choice
3
4import pandas as pd
5from pyspark.sql import SparkSession
6
7from pypair.spark import (
8 binary_binary,
9 confusion,
10 categorical_categorical,
11 agreement,
12 binary_continuous,
13 concordance,
14 categorical_continuous,
15 continuous_continuous,
16)
17
18
19def _get_binary_binary_data(spark):
20 """
21 Gets dummy binary-binary data in a Spark dataframe.
22
23 :return: Spark dataframe.
24 """
25
26 def get_data(x, y, n):
27 return [(x, y) * 2 for _ in range(n)]
28
29 data = get_data(1, 1, 207) + get_data(1, 0, 282) + get_data(0, 1, 231) + get_data(0, 0, 242)
30 pdf = pd.DataFrame(data, columns=["x1", "x2", "x3", "x4"])
31 sdf = spark.createDataFrame(pdf)
32 return sdf
33
34
35def _get_confusion_data(spark):
36 """
37 Gets dummy binary-binary data in Spark dataframe. For use with confusion matrix analysis.
38
39 :return: Spark dataframe.
40 """
41 tn = [(0, 0) * 2 for _ in range(50)]
42 fp = [(0, 1) * 2 for _ in range(10)]
43 fn = [(1, 0) * 2 for _ in range(5)]
44 tp = [(1, 1) * 2 for _ in range(100)]
45 data = tn + fp + fn + tp
46 pdf = pd.DataFrame(data, columns=["x1", "x2", "x3", "x4"])
47 sdf = spark.createDataFrame(pdf)
48 return sdf
49
50
51def _get_categorical_categorical_data(spark):
52 """
53 Gets dummy categorical-categorical data in Spark dataframe.
54
55 :return: Spark dataframe.
56 """
57 x_domain = ["a", "b", "c"]
58 y_domain = ["a", "b"]
59
60 def get_x():
61 return choice(x_domain)
62
63 def get_y():
64 return choice(y_domain)
65
66 def get_data():
67 return {f"x{i}": v for i, v in enumerate((get_x(), get_y(), get_x(), get_y()))}
68
69 pdf = pd.DataFrame([get_data() for _ in range(100)])
70 sdf = spark.createDataFrame(pdf)
71 return sdf
72
73
74def _get_binary_continuous_data(spark):
75 """
76 Gets dummy `binary-continuous data <https://www.slideshare.net/MuhammadKhalil66/point-biserial-correlation-example>`_.
77
78 :return: Spark dataframe.
79 """
80 data = [(1, 10), (1, 11), (1, 6), (1, 11), (0, 4), (0, 3), (1, 12), (0, 2), (0, 2), (0, 1)]
81 pdf = pd.DataFrame(data, columns=["gender", "years"])
82 sdf = spark.createDataFrame(pdf)
83 return sdf
84
85
86def _get_concordance_data(spark):
87 """
88 Gets dummy concordance data.
89
90 :return: Spark dataframe.
91 """
92 a = [1, 2, 3]
93 b = [3, 2, 1]
94 pdf = pd.DataFrame({"a": a, "b": b, "c": a, "d": b})
95 sdf = spark.createDataFrame(pdf)
96 return sdf
97
98
99def _get_categorical_continuous_data(spark):
100 data = [
101 ("a", 45),
102 ("a", 70),
103 ("a", 29),
104 ("a", 15),
105 ("a", 21),
106 ("g", 40),
107 ("g", 20),
108 ("g", 30),
109 ("g", 42),
110 ("s", 65),
111 ("s", 95),
112 ("s", 80),
113 ("s", 70),
114 ("s", 85),
115 ("s", 73),
116 ]
117 data = [tup * 2 for tup in data]
118 pdf = pd.DataFrame(data, columns=["x1", "x2", "x3", "x4"])
119 sdf = spark.createDataFrame(pdf)
120 return sdf
121
122
123def _get_continuous_continuous_data(spark):
124 """
125 Gets dummy continuous-continuous data.
126 See `site <http://onlinestatbook.com/2/describing_bivariate_data/calculation.html>`_.
127
128 :return: Spark dataframe.
129 """
130 data = [
131 (12, 9),
132 (10, 12),
133 (9, 12),
134 (14, 11),
135 (10, 8),
136 (11, 9),
137 (10, 9),
138 (10, 6),
139 (14, 12),
140 (9, 11),
141 (11, 12),
142 (10, 7),
143 (11, 13),
144 (15, 14),
145 (8, 11),
146 (11, 11),
147 (9, 8),
148 (9, 9),
149 (10, 11),
150 (12, 9),
151 (11, 12),
152 (10, 12),
153 (9, 7),
154 (7, 9),
155 (12, 14),
156 ]
157 pdf = pd.DataFrame([item * 2 for item in data], columns=["x1", "x2", "x3", "x4"])
158 sdf = spark.createDataFrame(pdf)
159 return sdf
160
161
spark = None

try:
    # create a spark session
    spark = SparkSession.builder.master("local[4]").appName("local-testing-pyspark").getOrCreate()

    # create some spark dataframes
    bin_sdf = _get_binary_binary_data(spark)
    con_sdf = _get_confusion_data(spark)
    cat_sdf = _get_categorical_categorical_data(spark)
    bcn_sdf = _get_binary_continuous_data(spark)
    crd_sdf = _get_concordance_data(spark)
    ccn_sdf = _get_categorical_continuous_data(spark)
    cnt_sdf = _get_continuous_continuous_data(spark)

    # call these methods to get the association measures
    bin_results = binary_binary(bin_sdf).collect()
    con_results = confusion(con_sdf).collect()
    cat_results = categorical_categorical(cat_sdf).collect()
    agr_results = agreement(bin_sdf).collect()
    bcn_results = binary_continuous(bcn_sdf, binary=["gender"], continuous=["years"]).collect()
    crd_results = concordance(crd_sdf).collect()
    ccn_results = categorical_continuous(ccn_sdf, ["x1", "x3"], ["x2", "x4"]).collect()
    cnt_results = continuous_continuous(cnt_sdf).collect()

    # convert the lists to dictionaries
    bin_results = {tup[0]: tup[1] for tup in bin_results}
    con_results = {tup[0]: tup[1] for tup in con_results}
    cat_results = {tup[0]: tup[1] for tup in cat_results}
    agr_results = {tup[0]: tup[1] for tup in agr_results}
    bcn_results = {tup[0]: tup[1] for tup in bcn_results}
    crd_results = {tup[0]: tup[1] for tup in crd_results}
    ccn_results = {tup[0]: tup[1] for tup in ccn_results}
    cnt_results = {tup[0]: tup[1] for tup in cnt_results}

    # pretty print
    def to_json(results):
        # the first two items of each result key are joined into a JSON-friendly string key
        return json.dumps({f"{k[0]}_{k[1]}": v for k, v in results.items()}, indent=1)

    print(to_json(bin_results))
    print("-" * 10)
    print(to_json(con_results))
    print("*" * 10)
    print(to_json(cat_results))
    print("~" * 10)
    print(to_json(agr_results))
    print("-" * 10)
    print(to_json(bcn_results))
    print("=" * 10)
    print(to_json(crd_results))
    print("`" * 10)
    print(to_json(ccn_results))
    print("/" * 10)
    print(to_json(cnt_results))
except Exception as e:
    # best-effort example: report the failure and still fall through to the cleanup
    print(e)
finally:
    # spark is still None when the session could not be created; there is nothing to stop then
    if spark is not None:
        try:
            spark.stop()
            print("closed spark")
        except Exception as e:
            print(e)