1

I am trying to accomplish a weird task. I need to complete the following without the use of sklearn, and preferably with numpy:

  1. Given a dataset, split the data into 5 equal "folds", or partitions
  2. Within each partition, split the data into a "training" and "testing" set, with an 80/20 split
  3. Here is the catch: Your dataset is labeled for classes. So take for example a dataset with 100 instances, and class A with 33 samples and class B with 67 samples. I should create 5 folds of 20 data instances, where in each fold, class A has something like 6 or 7 (1/3) values and class B has the rest

My issue is that I do not know how to properly return a test and training set for each fold, despite being able to split the data appropriately, and, more importantly, I do not know how to keep the proper number of elements per class in each fold.

My current code is here. It is commented where I am stuck:

import numpy

def csv_to_array(file):
    """Load a comma-separated file of numbers into a 2-D float array.

    Args:
        file: Path of the CSV file to read.

    Returns:
        A ``numpy.ndarray`` of floats with one row per line of the file.
        ``ndmin=2`` keeps a single-line file as a 1 x N array instead of
        collapsing it to a flat vector.

    Raises:
        ValueError: If a field cannot be parsed as a number (raised by
            ``numpy.loadtxt``).
    """
    # loadtxt already yields floats, so no per-row conversion pass is needed.
    # The context manager closes the file even when parsing fails.
    with open(file, 'r') as handle:
        return numpy.loadtxt(handle, delimiter=',', ndmin=2)

def five_cross_fold_validation(dataset):
    """Split a labelled dataset into five class-stratified folds.

    The class label is read from the last column. Rows are shuffled and
    then distributed so that, for every class, the per-fold counts differ
    by at most one. Each fold is then divided into a training part and a
    testing part: every fifth row of the fold goes to testing (about 20%),
    the rest to training, again keeping the class mix.

    Args:
        dataset: 2-D array-like, one row per instance, class label in the
            last column. It is not modified.

    Returns:
        A list of five ``(training, testing)`` tuples of ``numpy.ndarray``.
        Every input row appears exactly once across all returned parts.
        A fold with fewer than five rows has an empty testing part.

    Raises:
        ValueError: If ``dataset`` is not 2-D or has no columns.
    """
    num_folds = 5
    test_every = 5  # one row in five is held out for testing (80/20)

    dataset = numpy.asarray(dataset)
    if dataset.ndim != 2 or dataset.shape[1] == 0:
        raise ValueError(
            "dataset must be a 2-D array with the class label in the last column"
        )

    # permutation() returns a shuffled copy, so the caller's array is untouched.
    shuffled = numpy.random.permutation(dataset)
    # A stable sort groups rows by class while keeping the random order
    # inside each class.
    by_class = shuffled[numpy.argsort(shuffled[:, -1], kind="stable")]

    folds = []
    for offset in range(num_folds):
        # Dealing the class-sorted rows round-robin gives each fold either
        # floor(c / 5) or ceil(c / 5) rows of a class that has c rows.
        fold = by_class[offset::num_folds]
        # The fold is still sorted by class, so holding out every fifth row
        # takes about 20% of each class for testing.
        is_test = numpy.arange(fold.shape[0]) % test_every == test_every - 1
        folds.append((fold[~is_test], fold[is_test]))

    return folds

def confirm_size(folds):
    """Return the total number of rows contained in ``folds``.

    Args:
        folds: Iterable of folds. A fold that is a tuple is treated as a
            group of parts (such as ``(training, testing)``) and the
            lengths of its parts are added; any other fold contributes
            ``len(fold)``.

    Returns:
        The summed row count as an int.
    """
    total = 0
    for fold in folds:
        if isinstance(fold, tuple):
            total += sum(len(part) for part in fold)
        else:
            total += len(fold)
    return total


def main():
    """Run the demo: load the ecoli CSV, build the folds, report sizes.

    Prints a start banner, the length of the loaded array, and the total
    reported by ``confirm_size`` for the folds.
    """
    print("BEGINNING CFV")
    dataset = csv_to_array('Classification/ecoli.csv')
    print(len(dataset))
    print(confirm_size(five_cross_fold_validation(dataset)))

main()

Additionally, for reference, I have attached the CSV I am working with (it is a modification of the UCI Ecoli dataset). The classes are the values in the last column: 0, 1, 2, 3, 4. It is important to note that there are not equal amounts of each class.

        0.61,0.45,0.48,0.5,0.48,0.35,0.41,0
        0.17,0.38,0.48,0.5,0.45,0.42,0.5,0
        0.44,0.35,0.48,0.5,0.55,0.55,0.61,0
        0.43,0.4,0.48,0.5,0.39,0.28,0.39,0
        0.42,0.35,0.48,0.5,0.58,0.15,0.27,0
        0.23,0.33,0.48,0.5,0.43,0.33,0.43,0
        0.37,0.52,0.48,0.5,0.42,0.42,0.36,0
        0.29,0.3,0.48,0.5,0.45,0.03,0.17,0
        0.22,0.36,0.48,0.5,0.35,0.39,0.47,0
        0.23,0.58,0.48,0.5,0.37,0.53,0.59,0
        0.47,0.47,0.48,0.5,0.22,0.16,0.26,0
        0.54,0.47,0.48,0.5,0.28,0.33,0.42,0
        0.51,0.37,0.48,0.5,0.35,0.36,0.45,0
        0.4,0.35,0.48,0.5,0.45,0.33,0.42,0
        0.44,0.34,0.48,0.5,0.3,0.33,0.43,0
        0.44,0.49,0.48,0.5,0.39,0.38,0.4,0
        0.43,0.32,0.48,0.5,0.33,0.45,0.52,0
        0.49,0.43,0.48,0.5,0.49,0.3,0.4,0
        0.47,0.28,0.48,0.5,0.56,0.2,0.25,0
        0.32,0.33,0.48,0.5,0.6,0.06,0.2,0
        0.34,0.35,0.48,0.5,0.51,0.49,0.56,0
        0.35,0.34,0.48,0.5,0.46,0.3,0.27,0
        0.38,0.3,0.48,0.5,0.43,0.29,0.39,0
        0.38,0.44,0.48,0.5,0.43,0.2,0.31,0
        0.41,0.51,0.48,0.5,0.58,0.2,0.31,0
        0.34,0.42,0.48,0.5,0.41,0.34,0.43,0
        0.51,0.49,0.48,0.5,0.53,0.14,0.26,0
        0.25,0.51,0.48,0.5,0.37,0.42,0.5,0
        0.29,0.28,0.48,0.5,0.5,0.42,0.5,0
        0.25,0.26,0.48,0.5,0.39,0.32,0.42,0
        0.24,0.41,0.48,0.5,0.49,0.23,0.34,0
        0.17,0.39,0.48,0.5,0.53,0.3,0.39,0
        0.04,0.31,0.48,0.5,0.41,0.29,0.39,0
        0.61,0.36,0.48,0.5,0.49,0.35,0.44,0
        0.34,0.51,0.48,0.5,0.44,0.37,0.46,0
        0.28,0.33,0.48,0.5,0.45,0.22,0.33,0
        0.4,0.46,0.48,0.5,0.42,0.35,0.44,0
        0.23,0.34,0.48,0.5,0.43,0.26,0.37,0
        0.37,0.44,0.48,0.5,0.42,0.39,0.47,0
        0,0.38,0.48,0.5,0.42,0.48,0.55,0
        0.39,0.31,0.48,0.5,0.38,0.34,0.43,0
        0.3,0.44,0.48,0.5,0.49,0.22,0.33,0
        0.27,0.3,0.48,0.5,0.71,0.28,0.39,0
        0.17,0.52,0.48,0.5,0.49,0.37,0.46,0
        0.36,0.42,0.48,0.5,0.53,0.32,0.41,0
        0.3,0.37,0.48,0.5,0.43,0.18,0.3,0
        0.26,0.4,0.48,0.5,0.36,0.26,0.37,0
        0.4,0.41,0.48,0.5,0.55,0.22,0.33,0
        0.22,0.34,0.48,0.5,0.42,0.29,0.39,0
        0.44,0.35,0.48,0.5,0.44,0.52,0.59,0
        0.27,0.42,0.48,0.5,0.37,0.38,0.43,0
        0.16,0.43,0.48,0.5,0.54,0.27,0.37,0
        0.06,0.61,0.48,0.5,0.49,0.92,0.37,1
        0.44,0.52,0.48,0.5,0.43,0.47,0.54,1
        0.63,0.47,0.48,0.5,0.51,0.82,0.84,1
        0.23,0.48,0.48,0.5,0.59,0.88,0.89,1
        0.34,0.49,0.48,0.5,0.58,0.85,0.8,1
        0.43,0.4,0.48,0.5,0.58,0.75,0.78,1
        0.46,0.61,0.48,0.5,0.48,0.86,0.87,1
        0.27,0.35,0.48,0.5,0.51,0.77,0.79,1
artemis
  • 6,857
  • 11
  • 46
  • 99
  • Do you need the ratio to be exact (± 1) or do you need an expected ratio of |A|/|B|? – cglacet Mar 09 '19 at 01:03
  • 1
    Also how do you distinguish A from B in the CSV? – cglacet Mar 09 '19 at 01:13
  • Sorry for delay. Didnt hear the notification. The last column of the csv I pasted in contains the "classes", i.e., 0 1 2 3 4. I will make that edit. – artemis Mar 09 '19 at 01:22
  • And I'm sorry that I dont follow the first part. – artemis Mar 09 '19 at 01:23
  • The first part is actually important, because if you just need to have samples that are statistically representative of your input distribution of classes, then you can just pick 20% of the rows at random. The expected proportion of classes in the samples will be the same as in the input. On the other hand, if you need to have exactly the same proportions in the output, then you'll have to pick a random sample of size 0.2*(class size/total sample size) from each class. – cglacet Mar 09 '19 at 01:33
  • I guess it is the terminology that is confusing me but I still dont understand what is being asked and i am sorry because i dont want to turn away help. In short, I need to determine the % of the given dataset belongs to each class. So if I have classes 0-4 (5 classes), I need to determine how much of the dataset (in %), each class is responsible for. Thus, when I break my dataset into k folds (5 for example), each fold has the same ratio of classes in it – artemis Mar 09 '19 at 01:37
  • Let us [continue this discussion in chat](https://chat.stackoverflow.com/rooms/189710/discussion-between-cglacet-and-jerry-m). – cglacet Mar 09 '19 at 01:38

1 Answers1

1

Edit: I replaced np.random.shuffle(A) with A = np.random.permutation(A); the only difference is that it doesn't mutate the input array. This doesn't make any difference in this code, but it is safer in general.

The idea is to randomly sample the input by using numpy.random.permutation. Once the rows are shuffled we just need to iterate over all the possible test sets (sliding window of the desired size, here 20% of the input size). The corresponding training sets are just composed of all remaining elements.

Because we shuffled the input, this preserves the original class distribution on all subsets in expectation (not exactly), even though we pick the subsets in order.

The following code iterates over the test/train set combinations:

import numpy as np

def csv_to_array(file):
  """Read the comma-separated numeric file at path `file` into a NumPy array."""
  with open(file, 'r') as handle:
    return np.loadtxt(handle, delimiter=',')

def classes_distribution(A):
  """Print the share of rows belonging to each class of array A.

  The class label is read from the last column of A. One line is printed
  for every label actually present in A, in ascending label order, so a
  subset that is missing a class (or whose labels are not 0..k-1) is still
  reported under the correct labels.
  """
  # return_counts gives the per-label row counts in one vectorized pass.
  labels, counts = np.unique(A[:,-1], return_counts=True)
  total_size = A.shape[0]
  for label, count in zip(labels, counts):
    # Whole-number labels stored as floats (0.0, 1.0, ...) are shown as
    # "0", "1", ... to keep the "class_<n>" output format.
    name = int(label) if float(label).is_integer() else label
    class_p = count/total_size
    print(f"\t P(class_{name}) = {class_p:.3f}")

def random_samples(A, test_set_p=0.2):
  """Yield test/train splits of a shuffled copy of array A.

  The rows of A are shuffled once, then cut into round(1 / test_set_p)
  consecutive test sets whose sizes differ by at most one row (never more
  test sets than rows). For each test set the remaining rows form the
  training set, so every row is yielded exactly once as test data.

  Yields dicts with the keys "test" and "train". Nothing is yielded for an
  empty A. Raises ValueError if test_set_p is not in (0, 1].
  """
  if not 0 < test_set_p <= 1:
    raise ValueError("test_set_p must be in the interval (0, 1]")
  A = np.random.permutation(A)
  nb_rows = A.shape[0]
  if nb_rows == 0:
    return
  # Capping at nb_rows avoids empty test sets on very small inputs.
  nb_folds = min(nb_rows, max(1, int(round(1 / test_set_p))))
  # array_split spreads the remainder over the first chunks instead of
  # leaving a tiny trailing test set.
  for test_rows in np.array_split(np.arange(nb_rows), nb_folds):
    start, end = test_rows[0], test_rows[-1] + 1
    yield {
      "test": A[start:end,],
      "train": np.append(A[:start,], A[end:,], 0)
    }

def main():
  """Load ecoli.csv and print class distributions for it and each training set.

  Prints the shape of the loaded array and its class distribution, then
  the class distribution of the training set of every split produced by
  random_samples, each followed by a "---" separator line.
  """
  ecoli = csv_to_array('ecoli.csv')
  print("Input set shape: ", ecoli.shape)

  print("Input set class distribution:")
  classes_distribution(ecoli)

  print("Training sets class distributions:")
  for split in random_samples(ecoli):
    # Both halves are unpacked as placeholders for whatever uses them next.
    test_set, training_set = split["test"], split["train"]
    classes_distribution(training_set)
    print("---")

main()

It produces an output of the form:

Input set shape:  (169, 8)
Input set class distribution:
     P(class_0) = 0.308
     P(class_1) = 0.213
     P(class_2) = 0.207
     P(class_3) = 0.118
     P(class_4) = 0.154
Training sets class distributions:
     P(class_0) = 0.316
     P(class_1) = 0.206
     P(class_2) = 0.199
     P(class_3) = 0.118
     P(class_4) = 0.162
...
cglacet
  • 8,873
  • 4
  • 45
  • 60