Quickstart
Installation
Use PyPI to install the package.
pip install pypair
Confusion Matrix
A confusion matrix is typically used to judge binary classification performance. There are two variables, \(A\) and \(P\), where \(A\) is the actual value (ground truth) and \(P\) is the predicted value. The example below shows how to use the convenience method confusion()
and the class ConfusionMatrix
to get association measures derived from the confusion matrix.
1from pypair.association import confusion
2from pypair.contingency import ConfusionMatrix
3
4
5def get_data():
6 """
7 Data taken from `here <https://www.dataschool.io/simple-guide-to-confusion-matrix-terminology/>`_.
8 A pair of binary variables, `a` and `p`, are returned.
9
10 :return: a, p
11 """
12 tn = [(0, 0) for _ in range(50)]
13 fp = [(0, 1) for _ in range(10)]
14 fn = [(1, 0) for _ in range(5)]
15 tp = [(1, 1) for _ in range(100)]
16 data = tn + fp + fn + tp
17 a = [a for a, _ in data]
18 p = [b for _, b in data]
19 return a, p
20
21
22a, p = get_data()
23
24# if you need to quickly get just one association measure
25r = confusion(a, p, measure='acc')
26print(r)
27
28print('-' * 15)
29
30# you can also get a list of available association measures
31# and loop over to call confusion(...)
32# this is more convenient, but less fast
33for m in ConfusionMatrix.measures():
34 r = confusion(a, p, m)
35 print(f'{r}: {m}')
36
37print('-' * 15)
38
39# if you need multiple association measures, then
40# build the confusion matrix table
41# this is less convenient, but much faster
42matrix = ConfusionMatrix(a, p)
43for m in matrix.measures():
44 r = matrix.get(m)
45 print(f'{r}: {m}')
Binary-Binary
Association measures for binary-binary variables are computed using binary_binary()
or BinaryTable
.
1from pypair.association import binary_binary
2from pypair.contingency import BinaryTable
3
4get_data = lambda x, y, n: [(x, y) for _ in range(n)]
5data = get_data(1, 1, 207) + get_data(1, 0, 282) + get_data(0, 1, 231) + get_data(0, 0, 242)
6a = [a for a, _ in data]
7b = [b for _, b in data]
8
9for m in BinaryTable.measures():
10 r = binary_binary(a, b, m)
11 print(f'{r}: {m}')
12
13print('-' * 15)
14
15table = BinaryTable(a, b)
16for m in table.measures():
17 r = table.get(m)
18 print(f'{r}: {m}')
Categorical-Categorical
Association measures for categorical-categorical variables are computed using categorical_categorical()
or CategoricalTable
.
1from pypair.association import categorical_categorical
2from pypair.contingency import CategoricalTable
3
4get_data = lambda x, y, n: [(x, y) for _ in range(n)]
5data = get_data(1, 1, 207) + get_data(1, 0, 282) + get_data(0, 1, 231) + get_data(0, 0, 242)
6a = [a for a, _ in data]
7b = [b for _, b in data]
8
9for m in CategoricalTable.measures():
10 r = categorical_categorical(a, b, m)
11 print(f'{r}: {m}')
12
13print('-' * 15)
14
15table = CategoricalTable(a, b)
16for m in table.measures():
17 r = table.get(m)
18 print(f'{r}: {m}')
Binary-Continuous
Association measures for binary-continuous variables are computed using binary_continuous()
or Biserial
.
1from pypair.association import binary_continuous
2from pypair.biserial import Biserial
3
4get_data = lambda x, y, n: [(x, y) for _ in range(n)]
5data = get_data(1, 1, 207) + get_data(1, 0, 282) + get_data(0, 1, 231) + get_data(0, 0, 242)
6a = [a for a, _ in data]
7b = [b for _, b in data]
8
9for m in Biserial.measures():
10 r = binary_continuous(a, b, m)
11 print(f'{r}: {m}')
12
13print('-' * 15)
14
15biserial = Biserial(a, b)
16for m in biserial.measures():
17 r = biserial.get(m)
18 print(f'{r}: {m}')
Ordinal-Ordinal, Concordance
Concordance measures are used for ordinal-ordinal or continuous-continuous variables using concordance()
or Concordance
.
1from pypair.association import concordance
2from pypair.continuous import Concordance
3
4a = [1, 2, 3]
5b = [3, 2, 1]
6
7for m in Concordance.measures():
8 r = concordance(a, b, m)
9 print(f'{r}: {m}')
10
11print('-' * 15)
12
13con = Concordance(a, b)
14for m in con.measures():
15 r = con.get(m)
16 print(f'{r}: {m}')
Categorical-Continuous
Categorical-continuous association measures are computed using categorical_continuous()
or CorrelationRatio
.
1from pypair.association import categorical_continuous
2from pypair.continuous import CorrelationRatio
3
4data = [
5 ('a', 45), ('a', 70), ('a', 29), ('a', 15), ('a', 21),
6 ('g', 40), ('g', 20), ('g', 30), ('g', 42),
7 ('s', 65), ('s', 95), ('s', 80), ('s', 70), ('s', 85), ('s', 73)
8]
9x = [x for x, _ in data]
10y = [y for _, y in data]
11for m in CorrelationRatio.measures():
12 r = categorical_continuous(x, y, m)
13 print(f'{r}: {m}')
14
15print('-' * 15)
16
17cr = CorrelationRatio(x, y)
18for m in cr.measures():
19 r = cr.get(m)
20 print(f'{r}: {m}')
Continuous-Continuous
Association measures for continuous-continuous variables are computed using continuous_continuous()
or Continuous
.
1from pypair.association import continuous_continuous
2from pypair.continuous import Continuous
3
4x = [x for x in range(10)]
5y = [y for y in range(10)]
6
7for m in Continuous.measures():
8 r = continuous_continuous(x, y, m)
9 print(f'{r}: {m}')
10
11print('-' * 15)
12
13con = Continuous(x, y)
14for m in con.measures():
15 r = con.get(m)
16 print(f'{r}: {m}')
Recipe
Here’s a recipe in using multiprocessing to compute pairwise association with binary data.
1import pandas as pd
2import numpy as np
3import random
4from random import randint
5from pypair.association import binary_binary
6from itertools import combinations
7from multiprocessing import Pool
8
9np.random.seed(37)
10random.seed(37)
11
12def get_data(n_rows=1000, n_cols=5):
13 data = [tuple([randint(0, 1) for _ in range(n_cols)]) for _ in range(n_rows)]
14 cols = [f'x{i}' for i in range(n_cols)]
15 return pd.DataFrame(data, columns=cols)
16
17def compute(a, b, df):
18 x = df[a]
19 y = df[b]
20 return f'{a}_{b}', binary_binary(x, y, measure='jaccard')
21
22if __name__ == '__main__':
23 df = get_data()
24
25 with Pool(10) as pool:
26 pairs = ((a, b, df) for a, b in combinations(df.columns, 2))
27 bc = pool.starmap(compute, pairs)
28
29 bc = sorted(bc, key=lambda tup: tup[0])
30 print(dict(bc))
Here’s a nifty utility method to create a correlation matrix. The input data frame must be all the same type and you must supply a function. Note that Pandas DataFrame.corr()
no longer supports processing non-numeric data; fields that are not numeric will simply be skipped over.
1from random import randint
2
3import pandas as pd
4
5from pypair.association import binary_binary
6from pypair.util import corr
7
8
9def get_data(n_rows=1000, n_cols=5):
10 data = [tuple([randint(0, 1) for _ in range(n_cols)]) for _ in range(n_rows)]
11 cols = [f'x{i}' for i in range(n_cols)]
12 return pd.DataFrame(data, columns=cols)
13
14
15if __name__ == '__main__':
16 jaccard = lambda a, b: binary_binary(a, b, measure='jaccard')
17 tanimoto = lambda a, b: binary_binary(a, b, measure='tanimoto_i')
18
19 df = get_data()
20 jaccard_corr = corr(df, jaccard)
21 tanimoto_corr = corr(df, tanimoto)
22
23 print(jaccard_corr)
24 print('-' * 15)
25 print(tanimoto_corr)
Apache Spark
Spark is supported for some of the association measures. Active support is appreciated. Below are some code samples to get you started.
1import json
2from random import choice
3
4import pandas as pd
5from pyspark.sql import SparkSession
6
7from pypair.spark import binary_binary, confusion, categorical_categorical, agreement, binary_continuous, concordance, \
8 categorical_continuous, continuous_continuous
9
10
11def _get_binary_binary_data(spark):
12 """
13 Gets dummy binary-binary data in a Spark dataframe.
14
15 :return: Spark dataframe.
16 """
17 get_data = lambda x, y, n: [(x, y) * 2 for _ in range(n)]
18 data = get_data(1, 1, 207) + get_data(1, 0, 282) + get_data(0, 1, 231) + get_data(0, 0, 242)
19 pdf = pd.DataFrame(data, columns=['x1', 'x2', 'x3', 'x4'])
20 sdf = spark.createDataFrame(pdf)
21 return sdf
22
23
24def _get_confusion_data(spark):
25 """
26 Gets dummy binary-binary data in Spark dataframe. For use with confusion matrix analysis.
27
28 :return: Spark dataframe.
29 """
30 tn = [(0, 0) * 2 for _ in range(50)]
31 fp = [(0, 1) * 2 for _ in range(10)]
32 fn = [(1, 0) * 2 for _ in range(5)]
33 tp = [(1, 1) * 2 for _ in range(100)]
34 data = tn + fp + fn + tp
35 pdf = pd.DataFrame(data, columns=['x1', 'x2', 'x3', 'x4'])
36 sdf = spark.createDataFrame(pdf)
37 return sdf
38
39
40def _get_categorical_categorical_data(spark):
41 """
42 Gets dummy categorical-categorical data in Spark dataframe.
43
44 :return: Spark dataframe.
45 """
46 x_domain = ['a', 'b', 'c']
47 y_domain = ['a', 'b']
48
49 get_x = lambda: choice(x_domain)
50 get_y = lambda: choice(y_domain)
51 get_data = lambda: {f'x{i}': v for i, v in enumerate((get_x(), get_y(), get_x(), get_y()))}
52
53 pdf = pd.DataFrame([get_data() for _ in range(100)])
54 sdf = spark.createDataFrame(pdf)
55 return sdf
56
57
58def _get_binary_continuous_data(spark):
59 """
60 Gets dummy `binary-continuous data <https://www.slideshare.net/MuhammadKhalil66/point-biserial-correlation-example>`_.
61
62 :return: Spark dataframe.
63 """
64 data = [
65 (1, 10), (1, 11), (1, 6), (1, 11), (0, 4),
66 (0, 3), (1, 12), (0, 2), (0, 2), (0, 1)
67 ]
68 pdf = pd.DataFrame(data, columns=['gender', 'years'])
69 sdf = spark.createDataFrame(pdf)
70 return sdf
71
72
73def _get_concordance_data(spark):
74 """
75 Gets dummy concordance data.
76
77 :return: Spark dataframe.
78 """
79 a = [1, 2, 3]
80 b = [3, 2, 1]
81 pdf = pd.DataFrame({'a': a, 'b': b, 'c': a, 'd': b})
82 sdf = spark.createDataFrame(pdf)
83 return sdf
84
85
86def _get_categorical_continuous_data(spark):
87 data = [
88 ('a', 45), ('a', 70), ('a', 29), ('a', 15), ('a', 21),
89 ('g', 40), ('g', 20), ('g', 30), ('g', 42),
90 ('s', 65), ('s', 95), ('s', 80), ('s', 70), ('s', 85), ('s', 73)
91 ]
92 data = [tup * 2 for tup in data]
93 pdf = pd.DataFrame(data, columns=['x1', 'x2', 'x3', 'x4'])
94 sdf = spark.createDataFrame(pdf)
95 return sdf
96
97
98def _get_continuous_continuous_data(spark):
99 """
100 Gets dummy continuous-continuous data.
101 See `site <http://onlinestatbook.com/2/describing_bivariate_data/calculation.html>`_.
102
103 :return: Spark dataframe.
104 """
105 data = [
106 (12, 9),
107 (10, 12),
108 (9, 12),
109 (14, 11),
110 (10, 8),
111 (11, 9),
112 (10, 9),
113 (10, 6),
114 (14, 12),
115 (9, 11),
116 (11, 12),
117 (10, 7),
118 (11, 13),
119 (15, 14),
120 (8, 11),
121 (11, 11),
122 (9, 8),
123 (9, 9),
124 (10, 11),
125 (12, 9),
126 (11, 12),
127 (10, 12),
128 (9, 7),
129 (7, 9),
130 (12, 14)
131 ]
132 pdf = pd.DataFrame([item * 2 for item in data], columns=['x1', 'x2', 'x3', 'x4'])
133 sdf = spark.createDataFrame(pdf)
134 return sdf
135
136
137spark = None
138
139try:
140 # create a spark session
141 spark = (SparkSession.builder
142 .master('local[4]')
143 .appName('local-testing-pyspark')
144 .getOrCreate())
145
146 # create some spark dataframes
147 bin_sdf = _get_binary_binary_data(spark)
148 con_sdf = _get_confusion_data(spark)
149 cat_sdf = _get_categorical_categorical_data(spark)
150 bcn_sdf = _get_binary_continuous_data(spark)
151 crd_sdf = _get_concordance_data(spark)
152 ccn_sdf = _get_categorical_continuous_data(spark)
153 cnt_sdf = _get_continuous_continuous_data(spark)
154
155 # call these methods to get the association measures
156 bin_results = binary_binary(bin_sdf).collect()
157 con_results = confusion(con_sdf).collect()
158 cat_results = categorical_categorical(cat_sdf).collect()
159 agr_results = agreement(bin_sdf).collect()
160 bcn_results = binary_continuous(bcn_sdf, binary=['gender'], continuous=['years']).collect()
161 crd_results = concordance(crd_sdf).collect()
162 ccn_results = categorical_continuous(ccn_sdf, ['x1', 'x3'], ['x2', 'x4']).collect()
163 cnt_results = continuous_continuous(cnt_sdf).collect()
164
165 # convert the lists to dictionaries
166 bin_results = {tup[0]: tup[1] for tup in bin_results}
167 con_results = {tup[0]: tup[1] for tup in con_results}
168 cat_results = {tup[0]: tup[1] for tup in cat_results}
169 agr_results = {tup[0]: tup[1] for tup in agr_results}
170 bcn_results = {tup[0]: tup[1] for tup in bcn_results}
171 crd_results = {tup[0]: tup[1] for tup in crd_results}
172 ccn_results = {tup[0]: tup[1] for tup in ccn_results}
173 cnt_results = {tup[0]: tup[1] for tup in cnt_results}
174
175 # pretty print
176 to_json = lambda r: json.dumps({f'{k[0]}_{k[1]}': v for k, v in r.items()}, indent=1)
177 print(to_json(bin_results))
178 print('-' * 10)
179 print(to_json(con_results))
180 print('*' * 10)
181 print(to_json(cat_results))
182 print('~' * 10)
183 print(to_json(agr_results))
184 print('-' * 10)
185 print(to_json(bcn_results))
186 print('=' * 10)
187 print(to_json(crd_results))
188 print('`' * 10)
189 print(to_json(ccn_results))
190 print('/' * 10)
191 print(to_json(cnt_results))
192except Exception as e:
193 print(e)
194finally:
195 try:
196 spark.stop()
197 print('closed spark')
198 except Exception as e:
199 print(e)