I asked a question about converting a pile of RDF triples to a multiway array representation. My solution looks unwieldy and... ugly. I tried to use functions in my solution, but they feel unnecessary, since the advantages of using them are not obvious to me. I think perhaps my programming style is procedure-oriented rather than object-oriented.
Here is my solution (this is the original version, without any functions):
import numpy as np
from numpy import ones
from scipy.sparse import coo_matrix
# Parse 'test.txt', where each line has the form "(subject, predicate, object)",
# into three parallel label lists.
s = []
p = []
o = []
for row in open('test.txt'):
    inner = row.split('(')[1]
    s.append(inner.split(',')[0])
    p.append(inner.split(', ')[1])
    o.append(row.split(')')[0].split(', ')[2])
# Map every entity (subjects and objects share one index space) and every
# predicate to a dense integer id, in sorted order.
l = s + o
mapping = {entity: idx for idx, entity in enumerate(sorted(set(l)))}
mapping_p = {pred: idx for idx, pred in enumerate(sorted(set(p)))}
# Pair each label with its integer id.
n_s = [[mapping[entity], entity] for entity in s]
n_p = [[mapping_p[pred], pred] for pred in p]
n_o = [[mapping[entity], entity] for entity in o]
# Keep only the integer ids, one list per role.
cood_s = [pair[0] for pair in n_s]
cood_p = [pair[0] for pair in n_p]
cood_o = [pair[0] for pair in n_o]
# One [s_idx, o_idx, p_idx] row per triple.
cood = zip(cood_s, cood_o, cood_p)
data = [list(triple) for triple in cood]
# Bucket the coordinate rows by predicate id.
m = len(set(p))  # m is the number of predicates
data1 = [np.array([triple for triple in data if triple[2] == k]) for k in range(m)]
# For every predicate, build the dense n-by-n entity-to-entity matrix.
T = []
n = len(set(l))  # n is the number of entities
for k in range(m):
    coords = data1[k]
    T.append(np.array(coo_matrix((ones(len(coords)), (coords[:, 0], coords[:, 1])), shape=(n, n)).todense()))
OK, I hope you're not getting bored. Next, I used functions to group some of the statements together, as follows:
def split_data_matrix(filename):
    """Parse a triples file into subject, object and predicate label lists.

    Each line is expected to look like ``pred("subject", "object")`` —
    the two quoted fields are the subject and object, and everything
    before the opening parenthesis is the predicate.

    Parameters
    ----------
    filename : str
        Path of the file to parse.

    Returns
    -------
    tuple
        ``(s, o, p, l)`` where ``s``, ``o`` and ``p`` are parallel label
        lists and ``l`` is the concatenation ``s + o``.
    """
    s = []
    p = []
    o = []
    # Use a context manager so the file handle is closed deterministically
    # (the original left it open until garbage collection).
    with open(filename) as fh:
        for row in fh:
            s.append(row.split('"')[1])
            o.append(row.split('"')[3])
            p.append(row.split('(')[0])
    return s, o, p, s + o
# Parse 'test.txt' into subject, object and predicate lists; l is s + o.
s, o, p, l = split_data_matrix('test.txt')
def generate_mapping(l, p, s=None, o=None):
    """Pair every subject, predicate and object label with its integer index.

    Entities (subjects and objects) share one index space derived from
    ``l``; predicates get their own index space derived from ``p``. Both
    index spaces enumerate the sorted distinct labels.

    Parameters
    ----------
    l : list
        Concatenation of the subject and object lists (``s + o``).
    p : list
        Predicate labels, one per triple.
    s, o : list, optional
        Subject and object label lists. They default to the module-level
        ``s`` and ``o``, which the original version read implicitly as
        globals — passing them explicitly makes the function self-contained.

    Returns
    -------
    tuple
        ``(n_s, n_p, n_o)``: for each triple, ``[index, label]`` pairs.
    """
    if s is None:
        s = globals()['s']  # backward-compatible fallback to the old global
    if o is None:
        o = globals()['o']
    mapping = {v: i for (i, v) in enumerate(sorted(set(l)))}
    mapping_p = {v: i for (i, v) in enumerate(sorted(set(p)))}
    n_s = [[mapping[v], v] for v in s]
    n_p = [[mapping_p[v], v] for v in p]
    n_o = [[mapping[v], v] for v in o]
    return n_s, n_p, n_o
# Convert each label list into [index, label] pairs using the two mappings.
n_s, n_p, n_o = generate_mapping(l, p)
def generate_index(n_s, n_p, n_o):
    """Collapse the [index, label] pairs into coordinate rows.

    Parameters
    ----------
    n_s, n_p, n_o : list
        Parallel lists of ``[index, label]`` pairs for subjects,
        predicates and objects.

    Returns
    -------
    list
        One ``[s_idx, o_idx, p_idx]`` row per triple.
    """
    return [[n_s[k][0], n_o[k][0], n_p[k][0]] for k in range(len(n_s))]
# Collapse the [index, label] pairs into [s_idx, o_idx, p_idx] coordinate rows.
data = generate_index(n_s, n_p, n_o)
def generate_coordinate(data, num_predicates=None):
    """Group coordinate rows by their predicate index.

    Parameters
    ----------
    data : list
        ``[s_idx, o_idx, p_idx]`` rows as produced by ``generate_index``.
    num_predicates : int, optional
        Number of distinct predicates. When omitted it is derived from
        ``data`` itself (every predicate index 0..m-1 occurs at least
        once, since each predicate in the input maps to an index that
        appears in some row). This removes the original's hidden
        dependency on the module-level global ``p``.

    Returns
    -------
    list of numpy.ndarray
        One array per predicate index, holding that predicate's rows.
    """
    if num_predicates is None:
        # max predicate index + 1; empty data yields zero predicates
        num_predicates = max((row[2] for row in data), default=-1) + 1
    return [np.array([row for row in data if row[2] == k])
            for k in range(num_predicates)]
# Group the coordinate rows into one ndarray per predicate.
data1 = generate_coordinate(data)
def generate_ndarrays(data1, n=None):
    """Build one dense n-by-n entity-to-entity matrix per predicate.

    Parameters
    ----------
    data1 : list of numpy.ndarray
        Per-predicate coordinate arrays from ``generate_coordinate``;
        column 0 is the subject index, column 1 the object index.
    n : int, optional
        Number of entities (matrix side length). Defaults to
        ``len(set(l))`` using the module-level global ``l``, which is
        what the original read implicitly — pass it explicitly to make
        the function self-contained.

    Returns
    -------
    list of numpy.ndarray
        Dense (n, n) 0/1 matrices, one per predicate.
    """
    if n is None:
        n = len(set(globals()['l']))  # backward-compatible global fallback
    tensors = []
    # One slice per predicate; the original recomputed this count from the
    # global p, but it is by construction just len(data1).
    for coords in data1:
        dense = coo_matrix(
            (ones(len(coords)), (coords[:, 0], coords[:, 1])),
            shape=(n, n),
        ).todense()
        tensors.append(np.array(dense))
    return tensors
# Build the dense n-by-n adjacency matrix for every predicate slice.
T = generate_ndarrays(data1)
My process for creating functions is simple: I just group together lines with similar functionality and then define a function around them. I'm not sure whether my approach is reasonable. Can someone tell me how to deal with it? Any comments would be welcome.