I am trying to implement a 2-PL dichotomous IRT Model for my dataset from scratch in Python. Here is my code so far:
def get_data_matrix(num_students, num_items):
data = np.random.randint(0, 2, size=(num_students, num_items))
return data
# Define the logistic function
def logistic_func(theta, b, a):
return 1 / (1 + np.exp(-a * (theta - b)))
# Define the log-likelihood function for the 2PL model
def log_likelihood(params, response_matrix):
num_students, num_questions = response_matrix.shape
theta = params[:num_students]
b = params[num_students:num_students + num_questions]
a = params[num_students + num_questions:]
ll = 0
for i in range(num_students):
for j in range(num_questions):
ll += response_matrix[i, j] * np.log(logistic_func(theta[i], b[j], a[j]) + 1e-9)
ll += (1 - response_matrix[i, j]) * np.log(1 - logistic_func(theta[i], b[j], a[j]) + 1e-9)
return -ll
# Define the CMLE function for estimating theta
def cmle(response_matrix, b_estimated, a_estimated):
num_students, num_items = response_matrix.shape
def objective_function(theta):
ll = 0
for i in range(num_students):
for j in range(num_items):
ll += response_matrix[i, j] * np.log(logistic_func(theta[i], b_estimated[j], a_estimated[j]) + 1e-9)
ll += (1 - response_matrix[i, j]) * np.log(1 - logistic_func(theta[i], b_estimated[j], a_estimated[j]) + 1e-9)
return -ll
initial_theta = np.zeros(num_students) # Initial theta values
bounds = [(-3, 3)] * num_students # Bounds for theta estimation
result = minimize(objective_function, initial_theta, bounds=bounds, method='L-BFGS-B')
theta_estimated = result.x
return theta_estimated
if __name__ == "__main__":
# Generate example data
num_students = 400
num_questions = 20
# Assuming you have responses and design_matrix as your data inputs
# This is a random dataset -- will be replaced by the original data
data = get_matrix(num_students, num_questions)
initial_theta = np.zeros(num_students)
initial_b = np.zeros(num_questions)
initial_a = np.ones(num_questions)
initial_params = np.concatenate((initial_theta, initial_b, initial_a))
# Define the bounds for the parameters
bounds = [(-3, 3)] * num_students + [(-3, 3)] * num_questions + [(-3, 3)] * num_questions
# Perform maximum likelihood estimation
result = minimize(log_likelihood, initial_params, args=(data,), bounds=bounds, method='L-BFGS-B')
# Extract the estimated parameters
estimated_b = result.x[num_students:num_students + num_questions]
estimated_a = result.x[num_students + num_questions:]
# Fit a CMLE again to estimate theta values again
estimated_theta = cmle(data, estimated_b, estimated_a)
# Compute the infit and outfit values
data_succ_proportions = np.sum(data, axis=0)/num_students
predicted_data = logistic_func(estimated_theta.reshape(-1, 1), estimated_b, estimated_a)
# Compute the information matrix
information_matrix = np.zeros_like(data, dtype=float)
for i in range(num_questions):
for j in range(num_students):
p = predicted_data[j][i]
information_matrix[j][i] = p * (1 - p)
outfit = np.sum((data - predicted_data) ** 2/ information_matrix, axis=0) / (num_students)
infit = np.sum((data - predicted_data) ** 2, axis=0) / np.sum(information_matrix, axis=0)
# Compute the range of theta values
theta_range = np.linspace(-4, 4, num=100)
# Plot the ICC for each item
for j in range(num_questions):
p = logistic_func(theta_range, estimated_b[j], estimated_a[j])
print("Item " + str(j) + ":" + str(estimated_b[j]) + " " + str(estimated_a[j]))
plt.plot(theta_range, p, label=f"Item {j + 1}")
plt.xlabel("Theta")
plt.ylabel("Probability")
plt.title("Item Characteristic Curves (ICC)")
# Legend customization
legend = plt.legend(fontsize='x-small')
legend.get_frame().set_linewidth(0.5)
legend.get_frame().set_edgecolor('black')
legend.get_frame().set_alpha(0.7)
plt.show()
# Print the infit and outfit values
print("Data success:")
for item_idx, value in enumerate(data_succ_proportions):
print(f"Item {item_idx + 1}: {value}")
print("Infit:")
for item_idx, value in enumerate(infit):
print(f"Item {item_idx + 1}: {value}")
print("\nOutfit:")
for item_idx, value in enumerate(outfit):
print(f"Item {item_idx + 1}: {value}")
However, when I use a in-built library in a software like Jamovi to compute the same 2-PL IRT model on my dataset, I get a very different set of b_estimates, a_estimates and even InFit and OutFit values. For each item, the b_estimates at least exhibit similar trends but the infit and outfit values do not even show the same trends.
My questions are the following:
- Am I implementing 2-PL dichotomous IRT Model correctly and computing the infit and outfit stats correctly?
- On JAMOVI (snowirt module) they say the Marginal Maximum Likelihood estimate was used to obtain the result tables. Is my implementation also for Marginal Maximum Likelihood estimate?