I made a genetic algorithm to solve the Set Covering Problem (SCP) which consists of given a coverage matrix that tells us which columns cover which rows and what is the cost of each column to find the set of columns that cover all the rows at the lowest possible cost.
I've commented out my code to make it as clear as possible what I'm doing and what each function does.
I tested for the scp41.txt dataset from Beasley's OR library (http://people.brunel.ac.uk/~mastjjb/jeb/orlib/files/). The optimal solution for that data set is 429. However, my algorithm returns costs from 1200 to 1500 and stays there, from the first iteration it gives those values and does not improve.
# Reading part
import math
import numpy as np
import random
def leer_archivo2(archivo):
f = open(archivo, "r")
lines = f.read().splitlines()
# Dimesiones
m = int(lines[0].split()[0])
n = int(lines[0].split()[1])
# Costos
start_index = 1
end_index = math.ceil(n/12 + 1)
costos_str = " ".join(lines[start_index:end_index]).split()
costos = np.array([int(x) for x in costos_str])
# Matriz de cobertura
a_str = np.array(lines)[math.ceil(n/12 + 1) :][1::2]
cols_A = max(max(int(numero) for numero in fila.split()) for fila in a_str)+ 1
rows_A = len(a_str)
A = np.zeros((rows_A, cols_A), dtype=int)
for i, fila in enumerate(a_str):
numeros = fila.split()
for numero in numeros:
A[i, int(numero)] = 1
A = A[:, 1:]
return A, m, n, costos
A, m, n, costos = leer_archivo("scp41.txt")
# Functions of GA
# This function computes w, w is a vector
# that stores the number of columns that cover each row
def calcular_w(S, A):
m, n = A.shape
w = np.zeros(m)
for j in S:
b = [i for i, fila in enumerate(A) if fila[j] == 1]
for k in b:
w[k] = w[k] + 1
return w
# This function repairs solutions that are not feasible and makes them feasible
# #The objective of this function is to find columns that cover many rows that were left uncovered and that are also cheap.
def repara(S, A, costos):
w = calcular_w(S, A)
filas_no_cubiertas = np.where(w == 0)[0].tolist()
proporciones = []
# The following code block calculates the ratios: column cost / number of uncovered rows it covers
# and stores proportion of every column in proporciones[]
for j in range(n):
b = [i for i, fila in enumerate(A) if fila[j] == 1]
cant = len(set(b).intersection(set(filas_no_cubiertas)))
if(cant == 0):
proporcion = 0
else:
proporcion = costos[j]/cant
proporciones.append(proporcion)
# Once the proportion has been calculated, we find for each row the one with the smallest proportion
# since it would be the best column. When we find for a row i, we add to S and recalculate w
# to see which rows are still uncovered and so on until S is already feasible.
for i in range(m):
if(w[i] == 0):
a = [k for k, valor in enumerate(A[i]) if valor == 1]
proporciones_a = []
for k in a:
proporciones_a.append(proporciones[k])
columna_idonea = proporciones_a.index(min(proporciones_a))
S.append(a[columna_idonea])
w = calcular_w(S, A)
return S
# This is the initialize function
def inicializar(A, m, tam_poblacion):
poblacion = []
#In this for loop each row is traversed and a column that covers that row is randomly taken
# and added to S. In each iteration it checks if the row is already covered then it
# goes to the next one achieving a feasible solution
for i in range(tam_poblacion):
m, n = A.shape
S = []
w = np.zeros(m)
for i in range(m):
if(w[i] == 0):
a = np.where(A[i] == 1)[0]
j = np.random.choice(a)
S.append(j)
b = [i for i, fila in enumerate(A) if fila[j] == 1]
for k in b:
w[k] = w[k] + 1
else:
continue
# In these loops it is verified that rows are covered by more than 2 rows,
# and a column that covers said row is eliminated.
for i in range(len(w)):
if(w[i]>= 2):
for indice, j in enumerate(S):
b = [i for i, fila in enumerate(A) if fila[j] == 1]
if i in b:
S.pop(indice)
break
w = calcular_w(S, A)
# As some columns have been eliminated, some rows have been left uncovered,
# therefore the solution is repaired
S = repara(S, A, costos)
poblacion.append(S)
return poblacion
# Here the fitness is calculated, which is basically
# the sum of the cost of each column that has been taken as a solution.
def FO_fitness(poblacion):
F = []
for s in poblacion:
fitness = 0
for i in s:
fitness = fitness + costos[i]
F.append(fitness)
return F
# This is the selection method, a parent with higher fitness is less likely to be chosen,
# since higher fitness is higher cost and SCP is a minimization problem.
def seleccion(poblacion, fitness, tam_poblacion):
p = np.zeros(tam_poblacion)
den = 0
for i in range(tam_poblacion):
den = den + 1/fitness[i]
for i in range(tam_poblacion):
p[i] = (1/fitness[i])/den
padre, fitness = random.choices(list(zip(poblacion, p)))[0]
return padre, fitness
# In the crossover method, a child will take the value of one of its two parents,
# this with a higher probability for the parent with lower fitness.
def cruza(padre1, padre2, f1, f2):
if(padre1 == padre2):
hijo = padre1
else:
hijo = random.choices([padre1, padre2], [f2/(f1+f2), 1 - (f2/(f1+f2))])[0]
return hijo
# In the mutation method we simply change a number of columns given by the tasa_mutacion
# in a solution by other columns, the columns were restricted to non-repeating so they are not duplicated
def mutar_hijo(hijo, n, tasa_mutacion):
nuevo_hijo = hijo.copy()
num_mutaciones = int(tasa_mutacion * len(hijo))
indices_mutacion = random.sample(range(len(hijo)), num_mutaciones)
for indice in indices_mutacion:
nuevo_hijo[indice] = random.choice([x for x in range(n) if x not in hijo])
return nuevo_hijo
# Run GA
generaciones = 50
tam_poblacion = 110
poblacion = inicializar(A, m, tam_poblacion)
for i in range(generaciones):
F = FO_fitness(poblacion)
padre1, fit1 = seleccion(poblacion, F, tam_poblacion)
nueva_pop = [elemento for elemento in poblacion if elemento != padre1]
padre2, fit2 = seleccion(nueva_pop, F, tam_poblacion)
hijo = cruza(padre1, padre2, fit1, fit2)
hijo = mutar_hijo(hijo, n, 0.2)
hijo = repara(hijo, A, costos)
indice_lf = F.index(min(F))
poblacion[indice_lf] = hijo
optimo = min(FO_fitness(poblacion))
I don't expect it to give me the optimal one since the genetic algorithm does not guarantee it, but at least it improves, I don't see the reason why it is stagnant.
My algorithm is based on the paper: "Resolución del Problema de Set-Covering utilizando un Algoritmo Genético" by Pablo Itaim Ananias, 2005.