# utils/t_test.py

import numpy as np
from scipy.stats import ttest_ind

# Score samples used as input to the two-sample t-test at the end of this file.
# Layout: {dataset name: {metric name: [first_sample, second_sample]}}, where
# each sample is a list of float scores.
# NOTE(review): presumably each list holds the per-run scores of one method
# (e.g. proposed model vs. baseline) -- confirm which list is which.
topn_data_sample = {
    'Toys': {
        'HR@1':     [[0.0650, 0.0655, 0.0657, 0.0688, 0.0641, 0.0685, 0.0671, 0.0649, 0.0604, 0.0695],
                     [0.0579, 0.0579, 0.0595, 0.0572, 0.0612, 0.0574, 0.0596, 0.0583, 0.0563, 0.0537]],
        'HR@5':     [[0.1607, 0.1644, 0.1649, 0.1726, 0.1623, 0.1624, 0.1695, 0.1695, 0.1619, 0.1667],
                     [0.1461, 0.1480, 0.1447, 0.1444, 0.1534, 0.1473, 0.1533, 0.145, 0.1456, 0.1335]],
        'NDCG@5':  [[0.1142, 0.1164, 0.1165, 0.1219, 0.1145, 0.1173, 0.1192, 0.1188, 0.1124, 0.1195],
                     [0.1029, 0.1042, 0.1031, 0.1018, 0.1085, 0.1031, 0.1073, 0.1024, 0.1014, 0.0942]],
        'HR@10':    [[0.2325, 0.2380, 0.2343, 0.2430, 0.2334, 0.2321, 0.2449, 0.2399, 0.2360, 0.2407],
                     [0.2119, 0.2087, 0.2134, 0.2131, 0.218, 0.2176, 0.22, 0.2141, 0.2049, 0.1969]],
        'NDCG@10': [[0.1370, 0.1395, 0.1383, 0.1441, 0.1370, 0.1393, 0.1431, 0.1411, 0.1359, 0.1429],
                     [0.1244, 0.1245, 0.125, 0.1237, 0.1289, 0.1254, 0.1284, 0.1243, 0.1251, 0.1142]],
    },

    'Beauty': {
        'HR@1':     [[0.1164, 0.1178, 0.1193, 0.1219, 0.1206, 0.1132, 0.1202, 0.1221, 0.1276, 0.1237],
                    [0.0846, 0.0827, 0.0893, 0.0816, 0.0827, 0.0872, 0.0852, 0.0855, 0.0819, 0.0853]],
        'HR@5':     [[0.2513, 0.2555, 0.2572, 0.2568, 0.2568, 0.2478, 0.2550, 0.2565, 0.2710, 0.2641],
                    [0.1931, 0.1915, 0.1996, 0.1837, 0.1915, 0.2017, 0.1941, 0.192, 0.1901, 0.1937]],
        'NDCG@5':  [[0.1854, 0.1882, 0.1904, 0.1910, 0.1905, 0.1830, 0.1896, 0.1914, 0.2014, 0.1960],
                    [0.1404, 0.1382, 0.1465, 0.1344, 0.1382, 0.146, 0.1412, 0.1401, 0.138, 0.1409]],
        'HR@10':    [[0.3337, 0.3352, 0.3361, 0.3369, 0.3374, 0.3275, 0.3349, 0.3386, 0.3562, 0.3435],
                    [0.2677, 0.2649, 0.273, 0.2591, 0.2649, 0.2777, 0.2687, 0.2676, 0.2653, 0.2678]],
        'NDCG@10': [[0.2113, 0.2133, 0.2153, 0.2157, 0.2156, 0.2084, 0.2145, 0.2172, 0.2280, 0.2210],
                    [0.1639, 0.1614, 0.1699, 0.1584, 0.1614, 0.1701, 0.165, 0.1642, 0.1599, 0.1645]],
    },
    # NOTE(review): this dataset uses 'R@k' keys where 'Toys' and 'Beauty' use
    # 'HR@k'. The keys are only printed as labels, but confirm whether the
    # naming difference is intentional.
    'Sports': {
        'R@1':     [[0.1309, 0.1289, 0.1265, 0.1275, 0.1275, 0.1238, 0.1217, 0.1321, 0.1304, 0.1358],
                    [0.0927, 0.0895, 0.0974, 0.0899, 0.0945, 0.0977, 0.0938, 0.0886, 0.0931, 0.0898]],
        'R@5':     [[0.2810, 0.2788, 0.2725, 0.2791, 0.2644, 0.2764, 0.2615, 0.2744, 0.2726, 0.2864],
                    [0.2105, 0.2086, 0.22, 0.206, 0.2104, 0.2166, 0.2123, 0.2042, 0.2112, 0.2056]],
        'NDCG@5':  [[0.2081, 0.2059, 0.2004, 0.2048, 0.1976, 0.2014, 0.1936, 0.2049, 0.2035, 0.2128],
                    [0.1539, 0.1506, 0.1604, 0.1494, 0.1542, 0.1589, 0.1547, 0.1476, 0.1537, 0.1555]],
        # NOTE(review): the second sample below (~0.04) is far lower than the
        # second 'R@5' sample above (~0.21); if R@k is a hit/recall rate that
        # cannot decrease with k, these values look mis-entered -- verify
        # against the original experiment logs.
        'R@10':    [[0.2889, 0.2873, 0.2977, 0.2822, 0.2889, 0.2948, 0.2902, 0.2822, 0.2890, 0.2878],
                    [0.0393, 0.0385, 0.0402, 0.0394, 0.0378, 0.0416, 0.0369, 0.0382, 0.0372, 0.0372,]],
        'NDCG@10': [[0.2385, 0.2359, 0.2296, 0.2337, 0.2249, 0.2300, 0.2226, 0.2336, 0.2338, 0.2429],
                    [0.1782, 0.1756, 0.185, 0.1736, 0.179, 0.1836, 0.1794, 0.1722, 0.1783, 0.1769]],
    },
}

# Significance level: a p-value below this threshold is reported as 'significant'.
alpha = 0.05

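# Disabled copy of the t-test report, applied to seq_data_sample.
# NOTE: seq_data_sample is defined further down in this file, so this block
# must be moved below that definition before it is re-enabled.
# for t in seq_data_sample: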
#     print('>>>>>>>>>>>>>>>>>>>', t)
#     for t1 in seq_data_sample[t]:
#         if len(seq_data_sample[t][t1]) == 2:
#
#             data = seq_data_sample[t][t1]
#
#             if len(data[0]) == 0 or len(data[1]) == 0: continue
#
#             t_statistic, p_value = ttest_ind(data[0], data[1])
#
#             alpha = 0.05
#
#             # print('avg', np.mean(data[0]), np.mean(data[1]), 'imp', (np.mean(data[0])-np.mean(data[1]))/np.mean(data[1]))
#
#             if p_value < alpha:
#                 print('>>>>>>>', t1, p_value, 'significant')
#                 # print("Reject null hypothesis: There is a significant difference between the means.")
#             else:
#                 print('>>>>>>>', t1, p_value, 'not significant')
#                 # print("Fail to reject null hypothesis: There is no significant difference between the means.")


# Second collection of score samples, with the same layout as
# topn_data_sample: {dataset name: {metric name: [first_sample, second_sample]}}.
# Only the commented-out loop earlier in this file references this dict; the
# active report at the end of the file reads topn_data_sample.
# NOTE(review): presumably results for a sequential-recommendation setting,
# judging by the name -- confirm. 'Sports' uses 'R@k' keys where the other
# datasets use 'HR@k'.
seq_data_sample = {
    'Toys': {
        'HR@5':     [[0.0733, 0.0747, 0.0720, 0.0730, 0.0695, 0.0733, 0.0750, 0.0699, 0.0702, 0.0721],
                     [0.0695, 0.0691, 0.072, 0.0696, 0.0712, 0.0711, 0.0699, 0.063, 0.0696, 0.0696]],
        'NDCG@5':  [[0.0598, 0.0613, 0.0584, 0.0592, 0.0588, 0.0591, 0.0611, 0.0587, 0.0569, 0.0594],
                     [0.0589, 0.0599, 0.0602, 0.0591, 0.0601, 0.0583, 0.0589, 0.0556, 0.0598, 0.0586]],
        'HR@10':    [[0.0819, 0.0823, 0.0800, 0.0818, 0.0773, 0.0801, 0.0818, 0.0809, 0.0781, 0.0779],
                     [0.0749, 0.0742, 0.0768, 0.075, 0.077, 0.0767, 0.0755, 0.0692, 0.0745, 0.075]],
        'NDCG@10': [[0.0615, 0.0624, 0.0593, 0.0606, 0.0584, 0.0608, 0.0622, 0.0596, 0.0596, 0.0606],
                     [0.0601, 0.0610, 0.0611, 0.0602, 0.0613, 0.062, 0.0599, 0.0549, 0.0608, 0.0596]],
    },
    'Beauty': {
        'HR@5':     [[0.0658, 0.0592, 0.0559, 0.0588, 0.0610, 0.0611, 0.0605, 0.0542, 0.0629, 0.0613],
                    [0.0559, 0.0529, 0.058, 0.0547, 0.0598, 0.0568, 0.0554, 0.0605, 0.0511, 0.0537]],
        'NDCG@5':  [[0.0505, 0.0468, 0.0419, 0.0445, 0.0461, 0.0474, 0.0468, 0.0400, 0.0493, 0.0477],
                    [0.0429, 0.0391, 0.044, 0.0445, 0.0462, 0.0438, 0.0412, 0.0465, 0.0411, 0.0395]],
        'HR@10':    [[0.0820, 0.0721, 0.0689, 0.0719, 0.0767, 0.0753, 0.0755, 0.0681, 0.0774, 0.0749],
                    [0.0696, 0.0673, 0.0724, 0.0702, 0.0742, 0.0698, 0.0715, 0.0728, 0.0593, 0.0688]],
        'NDCG@10': [[0.0555, 0.0508, 0.0458, 0.0484, 0.0509, 0.0515, 0.0514, 0.0444, 0.0538, 0.0519],
                    [0.0471, 0.0435, 0.0486, 0.0483, 0.0509, 0.0479, 0.0463, 0.0505, 0.0435, 0.0443]],
    },
    'Sports': {
        'R@5':     [[0.0506, 0.0484, 0.0538, 0.0515, 0.052, 0.0527, 0.0522, 0.0436, 0.0474, 0.0523],
                    [0.0499, 0.0495, 0.0478, 0.0487, 0.0496, 0.0506, 0.0511, 0.0534, 0.0519, 0.0463]],
        'NDCG@5':  [[0.0412, 0.0395, 0.0467, 0.0403, 0.0422, 0.0427, 0.0426, 0.0336, 0.0371, 0.0419],
                    [0.0399, 0.0390, 0.0376, 0.0386, 0.0396, 0.0411, 0.0411, 0.0435, 0.0423, 0.0364]],
        'R@10':    [[0.0613, 0.0584, 0.0664, 0.0594, 0.0608, 0.0615, 0.0609, 0.0506, 0.0561, 0.0605],
                    [0.0589, 0.0562, 0.0581, 0.0561, 0.0576, 0.0587, 0.0593, 0.0607, 0.0594, 0.0544]],
        'NDCG@10': [[0.0435, 0.0417, 0.0486, 0.0428, 0.0448, 0.0453, 0.0449, 0.0375, 0.0397, 0.044],
                    [0.0422, 0.0416, 0.04, 0.0407, 0.0419, 0.0434, 0.0435, 0.0456, 0.0444, 0.0389]],
    },
}

def report_significance(samples, alpha=0.05):
    """Print an independent two-sample t-test verdict for every metric.

    For each dataset in *samples* a header line is printed, followed by one
    line per metric holding the metric name, the p-value returned by
    ``scipy.stats.ttest_ind`` and either ``'significant'`` (p-value below
    *alpha*, i.e. the null hypothesis of equal means is rejected) or
    ``'not significant'``.

    Args:
        samples: Mapping ``{dataset: {metric: [first_sample, second_sample]}}``
            where each sample is a sequence of numbers.
        alpha: Significance level the p-value is compared against.

    Metrics whose entry does not hold exactly two samples, or where either
    sample is empty, are skipped without output.
    """
    for dataset, metrics in samples.items():
        print('>>>>>>>>>>>>>>>>>>>', dataset)
        for metric, groups in metrics.items():
            # Only entries holding exactly two samples can be compared.
            if len(groups) != 2:
                continue

            first, second = groups
            # A t-test on an empty sample is undefined, so skip such entries.
            if len(first) == 0 or len(second) == 0:
                continue

            _, p_value = ttest_ind(first, second)
            verdict = 'significant' if p_value < alpha else 'not significant'
            print('>>>>>>>', metric, p_value, verdict)


# Use the module-level significance level rather than re-assigning it inside
# the loop, so the threshold is configured in exactly one place.
report_significance(topn_data_sample, alpha)