0

I am trying to implement a 2-PL dichotomous IRT Model for my dataset from scratch in Python. Here is my code so far:


def get_data_matrix(num_students, num_items):
    """Simulate a dichotomous (0/1) response matrix.

    Each entry is an independent draw from {0, 1}, row i = student i,
    column j = item j. Placeholder for the real dataset.
    """
    # randint(2, ...) draws uniformly from [0, 2), i.e. 0 or 1.
    return np.random.randint(2, size=(num_students, num_items))



# Define the logistic function
def logistic_func(theta, b, a):
    """2PL item response function.

    Returns P(correct) = 1 / (1 + exp(-a * (theta - b))) for ability
    theta, difficulty b, and discrimination a. Broadcasts over arrays.
    """
    z = a * (theta - b)
    return 1.0 / (1.0 + np.exp(-z))


# Define the log-likelihood function for the 2PL model
def log_likelihood(params, response_matrix):
    """Negative joint log-likelihood of the 2PL model.

    Parameters
    ----------
    params : 1-D array laid out as
        [theta_1..theta_S, b_1..b_Q, a_1..a_Q]
        (S = number of students, Q = number of items).
    response_matrix : (S, Q) array of 0/1 responses.

    Returns
    -------
    float
        Negative log-likelihood (minimize this to fit).
    """
    num_students, num_questions = response_matrix.shape
    theta = params[:num_students]
    b = params[num_students:num_students + num_questions]
    a = params[num_students + num_questions:]

    # Success probability for every (student, item) pair at once:
    # theta[:, None] is (S, 1), b and a are (Q,), broadcasting to (S, Q).
    # This replaces the original O(S*Q) Python double loop (which also
    # evaluated the logistic twice per cell) with one vectorized pass.
    p = 1.0 / (1.0 + np.exp(-a * (theta[:, None] - b)))

    # 1e-9 guards against log(0) when p saturates at 0 or 1
    # (kept identical to the original formulation).
    ll = np.sum(response_matrix * np.log(p + 1e-9)
                + (1 - response_matrix) * np.log(1 - p + 1e-9))

    return -ll

# Define the CMLE function for estimating theta
def cmle(response_matrix, b_estimated, a_estimated):
    """Estimate abilities (theta) with the item parameters held fixed.

    Maximizes the conditional log-likelihood of the responses given the
    already-estimated item difficulties and discriminations.

    Parameters
    ----------
    response_matrix : (S, Q) array of 0/1 responses.
    b_estimated : (Q,) array of item difficulties.
    a_estimated : (Q,) array of item discriminations.

    Returns
    -------
    (S,) array of estimated abilities, each constrained to [-3, 3].
    """
    num_students, num_items = response_matrix.shape

    def objective_function(theta):
        # Vectorized negative log-likelihood; the objective is evaluated on
        # every optimizer iteration, so the original O(S*Q) Python double
        # loop here was the dominant cost. (S, 1) theta broadcasts against
        # the (Q,) item parameters to give an (S, Q) probability matrix.
        p = 1.0 / (1.0 + np.exp(-a_estimated * (theta[:, None] - b_estimated)))
        # 1e-9 guards against log(0), matching log_likelihood().
        ll = np.sum(response_matrix * np.log(p + 1e-9)
                    + (1 - response_matrix) * np.log(1 - p + 1e-9))
        return -ll

    initial_theta = np.zeros(num_students)      # start all abilities at 0
    bounds = [(-3, 3)] * num_students           # box constraints on theta

    result = minimize(objective_function, initial_theta, bounds=bounds, method='L-BFGS-B')

    return result.x




if __name__ == "__main__":

    # Problem size for the simulated demo dataset.
    num_students = 400
    num_questions = 20

    # Simulated 0/1 response matrix -- replace with the real data later.
    # BUG FIX: the original called get_matrix(), which is not defined
    # anywhere; the generator defined above is get_data_matrix().
    data = get_data_matrix(num_students, num_questions)

    # Joint starting point: abilities 0, difficulties 0, discriminations 1,
    # packed as [theta | b | a] to match log_likelihood()'s layout.
    initial_theta = np.zeros(num_students)
    initial_b = np.zeros(num_questions)
    initial_a = np.ones(num_questions)
    initial_params = np.concatenate((initial_theta, initial_b, initial_a))

    # Box constraints keep every parameter in [-3, 3].
    bounds = [(-3, 3)] * num_students + [(-3, 3)] * num_questions + [(-3, 3)] * num_questions

    # Maximum-likelihood estimation of theta, b and a jointly.
    # NOTE(review): this is JOINT ML (theta and item parameters estimated
    # together), not the MARGINAL ML (theta integrated out) that jamovi's
    # snowIRT module reports -- estimates will differ systematically.
    result = minimize(log_likelihood, initial_params, args=(data,), bounds=bounds, method='L-BFGS-B')

    # Extract the estimated item parameters from the flat parameter vector.
    estimated_b = result.x[num_students:num_students + num_questions]
    estimated_a = result.x[num_students + num_questions:]

    # Re-estimate theta conditional on the item parameters just obtained.
    estimated_theta = cmle(data, estimated_b, estimated_a)

    # Per-item proportion of correct responses (raw success rates).
    data_succ_proportions = np.sum(data, axis=0) / num_students

    # Model-implied success probabilities for every (student, item) pair.
    predicted_data = logistic_func(estimated_theta.reshape(-1, 1), estimated_b, estimated_a)

    # Binomial variance p*(1-p) of each response, used as the fit weights.
    # (Replaces the original element-by-element double loop with the
    # equivalent vectorized expression.)
    information_matrix = predicted_data * (1 - predicted_data)

    # Outfit: unweighted mean of squared standardized residuals per item.
    # NOTE(review): information can approach 0 for extreme p, which can
    # inflate outfit -- a likely source of divergence from jamovi's values.
    outfit = np.sum((data - predicted_data) ** 2 / information_matrix, axis=0) / (num_students)
    # Infit: information-weighted mean square per item.
    infit = np.sum((data - predicted_data) ** 2, axis=0) / np.sum(information_matrix, axis=0)

    # Ability grid for plotting the item characteristic curves.
    theta_range = np.linspace(-4, 4, num=100)

    # Plot the ICC for each item and print its estimated parameters.
    for j in range(num_questions):
        p = logistic_func(theta_range, estimated_b[j], estimated_a[j])
        print("Item " + str(j) + ":" + str(estimated_b[j]) + " " + str(estimated_a[j]))
        plt.plot(theta_range, p, label=f"Item {j + 1}")

    plt.xlabel("Theta")
    plt.ylabel("Probability")
    plt.title("Item Characteristic Curves (ICC)")
    # Legend customization
    legend = plt.legend(fontsize='x-small')
    legend.get_frame().set_linewidth(0.5)
    legend.get_frame().set_edgecolor('black')
    legend.get_frame().set_alpha(0.7)
    plt.show()

    # Print the raw success rates and the fit statistics per item.
    print("Data success:")
    for item_idx, value in enumerate(data_succ_proportions):
        print(f"Item {item_idx + 1}: {value}")

    print("Infit:")
    for item_idx, value in enumerate(infit):
        print(f"Item {item_idx + 1}: {value}")

    print("\nOutfit:")
    for item_idx, value in enumerate(outfit):
        print(f"Item {item_idx + 1}: {value}")

However, when I use an in-built library in software such as jamovi to compute the same 2-PL IRT model on my dataset, I get a very different set of b estimates, a estimates, and even infit and outfit values. For each item, the b estimates at least exhibit similar trends, but the infit and outfit values do not even show the same trends.

My questions are the following:

  • Am I implementing 2-PL dichotomous IRT Model correctly and computing the infit and outfit stats correctly?
  • On JAMOVI (snowirt module) they say the Marginal Maximum Likelihood estimate was used to obtain the result tables. Is my implementation also for Marginal Maximum Likelihood estimate?
204
  • 433
  • 1
  • 5
  • 19

0 Answers0