CompFE/GenerateSubsets.py at main · benjaminfuller/CompFE · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
import math
import time
from joblib import Parallel, delayed
import os
import sys
import glob
import re
import random
from matplotlib import pyplot as plt
import numpy as np
import multiprocessing as mp
import pickle
#np.random.seed(1337) # for reproducibility

################################################################################
#                      FUNCTION DEFINITIONS                                    #
################################################################################

def numericalSort(value):
    """Build a natural-sort key for a string that contains digit runs.

    The string is split on runs of decimal digits; the digit runs are
    converted to ``int`` so that e.g. ``"f10"`` sorts after ``"f2"``.

    Args:
        value: The string to turn into a sort key.

    Returns:
        A list alternating text pieces (even positions) and integers
        (odd positions), e.g. ``"a10b2"`` -> ``['a', 10, 'b', 2, '']``.
    """
    # The pattern is spelled out here instead of relying on a module-level
    # regex assigned later in the script, so the function works on its own.
    # The capturing group makes re.split keep the digit runs, which therefore
    # land at the odd indices of the result.
    parts = re.split(r'(\d+)', value)
    parts[1::2] = [int(digits) for digits in parts[1::2]]
    return parts

# Returns a numpy array with number_samples rows; each row holds `size` distinct
# positions chosen uniformly at random from range(biometric_len).
def sample_uniform(size, biometric_len, number_samples=1, confidence=None):
    """Draw uniformly random position subsets without replacement.

    Args:
        size: Number of distinct positions in each subset.
        biometric_len: Number of positions to choose from; valid positions
            are ``0 .. biometric_len - 1``.
        number_samples: How many subsets to draw.
        confidence: Unused; accepted so the signature matches the weighted
            samplers in this file.

    Returns:
        ``np.array`` built from ``number_samples`` lists of ``size`` positions.

    Raises:
        ValueError: If ``size`` exceeds ``biometric_len`` (from
            ``random.sample``).
    """
    # range(biometric_len) covers every position including the last one; the
    # previous upper bound of biometric_len - 1 silently excluded it.
    pick_range = range(biometric_len)
    randGen = random.SystemRandom()
    return np.array([randGen.sample(pick_range, size) for _ in range(number_samples)])

def min_entropy(val):
    """Return the min-entropy, in bits, of a Bernoulli variable.

    Args:
        val: Probability of one of the two outcomes, in ``[0, 1]``.

    Returns:
        ``min(-log2(val), -log2(1 - val))``; ``0.0`` when ``val`` is exactly
        0 or 1 (a deterministic outcome carries no entropy).

    Raises:
        ValueError: If ``val`` lies outside ``[0, 1]`` (from ``math.log2``).
    """
    # At the endpoints one of the logarithms is log2(0), which raises a
    # domain error even though the min-entropy is well defined there.
    if val == 0 or val == 1:
        return 0.0
    return min(- math.log2(val), - math.log2(1-val))

def binary_entropy(val):
    """Return the Shannon entropy, in bits, of a Bernoulli variable.

    Args:
        val: Probability of one of the two outcomes, in ``[0, 1]``.

    Returns:
        ``-val*log2(val) - (1-val)*log2(1-val)``; ``0.0`` when ``val`` is
        exactly 0 or 1, following the usual ``0 * log 0 = 0`` convention.

    Raises:
        ValueError: If ``val`` lies outside ``[0, 1]`` (from ``math.log2``).
    """
    # log2(0) is a domain error, so the endpoints are handled explicitly.
    if val == 0 or val == 1:
        return 0.0
    return -(val) * math.log2(val) - (1 - val) * (math.log2(1 - val))

def read_complex_conf(filepath):
    """Parse a confidence file into ``[predictability, entropy]`` pairs.

    For every non-blank line the first 42 characters are dropped, the rest is
    stripped and split on single spaces, and these fields are read:

    * field 0 -- an integer, checked against ``bad_list``;
    * field 2 -- a float; ``1 - value`` is stored as the predictability;
    * field 3 -- a float stored as the entropy.

    NOTE(review): the 42-character offset presumably skips a fixed-width
    label at the start of each line -- confirm against the file format.

    Args:
        filepath: Path of the text file to read.

    Returns:
        A tuple ``(confidence, bad_list)``. ``confidence`` has one
        ``[predictability, entropy]`` pair per non-blank line, in file order.
        ``bad_list`` is the (currently empty) list of field-0 values whose
        pair is replaced by ``[0, 1e-15]``.

    Raises:
        IndexError: If a non-blank line has fewer than four fields.
        ValueError: If a field cannot be converted to a number.
    """
    bad_list = []
    confidence = []
    with open(filepath, 'r') as f:
        # Iterate the file lazily rather than loading every line up front.
        for line in f:
            # Blank lines (e.g. trailing newlines) are not records; skipping
            # them avoids an IndexError on the field lookups below.
            if not line.strip():
                continue
            fields = line[42:].strip().split(' ')
            predictability = 1 - float(fields[2])
            entropy = float(fields[3])
            if int(fields[0]) in bad_list:
                confidence.append([0, 0.000000000000001])
            else:
                confidence.append([predictability, entropy])
    return confidence, bad_list

def gen(template,positions):
    """Pick the entries of ``template`` addressed by ``positions``.

    ``positions`` must expose ``.shape`` (e.g. a numpy array); one lookup
    ``template[positions[i]]`` is made for each index along its first axis.

    Returns:
        A Python list with the looked-up values, in the order of ``positions``.
    """
    count = positions.shape[0]
    return [template[positions[idx]] for idx in range(count)]

def sample_alpha(size, biometric_len, number_samples, confidence, alpha_param):
    """Draw position subsets weighted by ``confidence[i][0] ** alpha_param``.

    Each subset is built by repeated weighted draws (with replacement),
    keeping only indices that are new and not in the hard-coded bad list,
    until ``size`` distinct indices have been collected.

    Args:
        size: Number of distinct indices per subset.
        biometric_len: Only used by the uniform fallback when ``confidence``
            is ``None``.
        number_samples: Number of subsets to draw.
        confidence: Sequence of pairs; the first element of each pair is the
            base of the sampling weight. ``None`` falls back to
            ``sample_uniform``.
        alpha_param: Exponent applied to the first element of each pair.

    Returns:
        ``np.array`` built from ``number_samples`` lists of ``size`` indices.
        Indices appear in the order in which they were first drawn.

    Raises:
        SystemExit: With code 1 when a subset cannot be completed (too few
            usable indices, or the retry limit is reached).
    """
    # Positions that must never be selected; a set gives O(1) membership tests.
    bad_list = {28, 200, 503, 754}
    if confidence is None:
        print("Can't run Smart sampling without confidence, calling uniform")
        return sample_uniform(size, biometric_len, number_samples, confidence)

    new_confidence = [pair[0] ** alpha_param for pair in confidence]
    population = range(len(new_confidence))

    # Fail fast when there are not enough non-bad indices, instead of
    # spinning through the full retry budget first.
    usable = sum(1 for index in population if index not in bad_list)
    if size > usable:
        print("Smart sampling failed to find a non-duplicating subset")
        raise SystemExit(1)

    sample_array = []
    for _ in range(number_samples):
        chosen = []
        seen = set()

        # The bad-list filter is applied to the initial batch as well as to
        # the top-up draws; previously a bad index drawn in the first batch
        # could be returned.
        for index in random.choices(population, weights=new_confidence, k=size):
            if index not in seen and index not in bad_list:
                seen.add(index)
                chosen.append(index)

        loop_count = 1
        while len(chosen) < size:
            index = random.choices(population, weights=new_confidence, k=1)[0]
            if index not in seen and index not in bad_list:
                seen.add(index)
                chosen.append(index)
            loop_count = loop_count + 1
            if loop_count == 1000000:
                print("Smart sampling failed to find a non-duplicating subset")
                raise SystemExit(1)
        sample_array.append(chosen)
    return np.array(sample_array)

def sample_alpha_with_entropy(size, biometric_len, number_samples, confidence, alpha_param):
    """Draw position subsets using weights built from both pair elements.

    The weight of index ``i`` is
    ``(confidence[i][0] / max(confidence[i][1], 1 - confidence[i][1])) ** alpha_param``.
    Each subset starts from ``size`` weighted draws (with replacement) with
    bad-list entries and duplicates removed, and is then topped up with
    further weighted draws until it holds ``size`` distinct indices.

    When ``confidence`` is ``None`` the call is forwarded to
    ``sample_uniform``. After the retry limit is reached a message is printed
    and the process exits with status 1.

    Returns:
        ``np.array`` built from ``number_samples`` lists of ``size`` indices.
    """
    # Hard-coded positions that are never allowed into a subset.
    bad_list = [28, 200, 754]
    if confidence is None:
        print("Can't run Smart sampling without confidence, calling uniform")
        return sample_uniform(size, biometric_len, number_samples, confidence)

    weights = []
    for pair in confidence:
        ratio = pair[0] / max(pair[1], 1 - pair[1])
        weights.append(ratio ** (alpha_param))

    subsets = []
    for _ in range(number_samples):
        first_draw = random.choices(range(len(weights)), weights=weights, k=size)
        allowed = [idx for idx in first_draw if idx not in bad_list]
        subset = list(set(allowed))
        attempts = 1
        while len(subset) < size:
            # Ask for as many indices as are still missing (at least one).
            shortfall = size - len(subset)
            extra = random.choices(range(len(weights)), weights=weights, k=max(1, shortfall))
            for idx in extra:
                if idx in subset or idx in bad_list:
                    continue
                subset.append(idx)
            attempts += 1
            if attempts == 1000000:
                print("Smart sampling failed to find a non-duplicating subset")
                exit(1)
        subsets.append(subset)
    return np.array(subsets)

################################################################################
#                    EXECUTION SCRIPT                                          #
################################################################################

# Command Line Usage:
# python3 GenerateSubsets.py [subset size] ['simple' or 'complex'] [alpha] [number of subsets] [output file name]

# Stop with a usage message instead of an IndexError when arguments are missing.
if len(sys.argv) < 6:
    sys.exit("Usage: python3 GenerateSubsets.py [subset size] ['simple' or 'complex'] "
             "[alpha] [number of subsets] [output file name]")

size_or_threshold = int(sys.argv[1]) # Subset size
selection_method = sys.argv[2] # 'complex' or 'simple'
alpha_param = float(sys.argv[3]) # Confidence Weight Parameter
num_lockers = int(sys.argv[4]) # number of subsets sampled
outputfilename = sys.argv[5] + str(size_or_threshold) +  str(selection_method) + str(alpha_param)  + '_' + str(num_lockers) #output file name

# Reject an unknown method up front; otherwise neither branch below would
# assign `positions` and the script would die later with a NameError.
if selection_method not in ('complex', 'simple'):
    sys.exit("Unknown selection method '" + selection_method + "': expected 'simple' or 'complex'")

# Digit-run pattern used by numericalSort to order folder names naturally.
numbers = re.compile(r'(\d+)')
cwd = os.getcwd()
num_cpus = mp.cpu_count()
# NOTE: the glob pattern below is a placeholder that must be replaced with the
# feature vector folder before the folder count printed below is meaningful.
folder_list = sorted(glob.glob(cwd + "<Enter your feature vector folder here>"),key=numericalSort)
print (cwd)
print ("Folders: ",len(folder_list))
num_classes = range(len(folder_list))

print("Reading Confidence")
confidence, bad_list = read_complex_conf(cwd + "/PythonImpl/AuxiliaryFiles/confidence_672_70classes.txt")

if selection_method == 'complex':
    print("Generating Subsets Using Complex Alpha Sampling")
    positions = sample_alpha_with_entropy(size_or_threshold,1024,num_lockers,confidence,alpha_param)
elif selection_method == 'simple':
    print("Generating Subsets Using Simple Alpha Sampling")
    positions = sample_alpha(size_or_threshold,1024,num_lockers,confidence,alpha_param)

# The with-statement closes the file; no explicit close() is needed.
with open(outputfilename + ".pkl",'wb') as f:
    pickle.dump(positions, f)

print("Generated " + str(len(positions)) + " subsets of length " + str(len(positions[0])))
print("Finished Generating Subsets")