0

I am trying to implement a 2-PL dichotomous IRT Model for my dataset from scratch in Python. Here is my code so far:


def get_data_matrix(num_students, num_items):
    """Simulate a dichotomous (0/1) response matrix.

    Each entry is an independent draw from {0, 1}, row i = student i,
    column j = item j. Placeholder for the real dataset.
    """
    # randint(2, ...) draws uniformly from [0, 2), i.e. 0 or 1.
    return np.random.randint(2, size=(num_students, num_items))



# Define the logistic function
def logistic_func(theta, b, a):
    """2PL item response function.

    Returns P(correct) = 1 / (1 + exp(-a * (theta - b))) for ability
    theta, difficulty b, and discrimination a. Broadcasts over arrays.
    """
    z = a * (theta - b)
    return 1.0 / (1.0 + np.exp(-z))


# Define the log-likelihood function for the 2PL model
def log_likelihood(params, response_matrix):
    """Negative joint log-likelihood of the 2PL model.

    Parameters
    ----------
    params : 1-D array laid out as
        [theta_1..theta_S, b_1..b_Q, a_1..a_Q]
        (S = number of students, Q = number of items).
    response_matrix : (S, Q) array of 0/1 responses.

    Returns
    -------
    float
        Negative log-likelihood (minimize this to fit).
    """
    num_students, num_questions = response_matrix.shape
    theta = params[:num_students]
    b = params[num_students:num_students + num_questions]
    a = params[num_students + num_questions:]

    # Success probability for every (student, item) pair at once:
    # theta[:, None] is (S, 1), b and a are (Q,), broadcasting to (S, Q).
    # This replaces the original O(S*Q) Python double loop (which also
    # evaluated the logistic twice per cell) with one vectorized pass.
    p = 1.0 / (1.0 + np.exp(-a * (theta[:, None] - b)))

    # 1e-9 guards against log(0) when p saturates at 0 or 1
    # (kept identical to the original formulation).
    ll = np.sum(response_matrix * np.log(p + 1e-9)
                + (1 - response_matrix) * np.log(1 - p + 1e-9))

    return -ll

# Define the CMLE function for estimating theta
def cmle(response_matrix, b_estimated, a_estimated):
    """Estimate abilities (theta) with the item parameters held fixed.

    Maximizes the conditional log-likelihood of the responses given the
    already-estimated item difficulties and discriminations.

    Parameters
    ----------
    response_matrix : (S, Q) array of 0/1 responses.
    b_estimated : (Q,) array of item difficulties.
    a_estimated : (Q,) array of item discriminations.

    Returns
    -------
    (S,) array of estimated abilities, each constrained to [-3, 3].
    """
    num_students, num_items = response_matrix.shape

    def objective_function(theta):
        # Vectorized negative log-likelihood; the objective is evaluated on
        # every optimizer iteration, so the original O(S*Q) Python double
        # loop here was the dominant cost. (S, 1) theta broadcasts against
        # the (Q,) item parameters to give an (S, Q) probability matrix.
        p = 1.0 / (1.0 + np.exp(-a_estimated * (theta[:, None] - b_estimated)))
        # 1e-9 guards against log(0), matching log_likelihood().
        ll = np.sum(response_matrix * np.log(p + 1e-9)
                    + (1 - response_matrix) * np.log(1 - p + 1e-9))
        return -ll

    initial_theta = np.zeros(num_students)      # start all abilities at 0
    bounds = [(-3, 3)] * num_students           # box constraints on theta

    result = minimize(objective_function, initial_theta, bounds=bounds, method='L-BFGS-B')

    return result.x




if __name__ == "__main__":

    # Problem size for the simulated demo dataset.
    num_students = 400
    num_questions = 20

    # Simulated 0/1 response matrix -- replace with the real data later.
    # BUG FIX: the original called get_matrix(), which is not defined
    # anywhere; the generator defined above is get_data_matrix().
    data = get_data_matrix(num_students, num_questions)

    # Joint starting point: abilities 0, difficulties 0, discriminations 1,
    # packed as [theta | b | a] to match log_likelihood()'s layout.
    initial_theta = np.zeros(num_students)
    initial_b = np.zeros(num_questions)
    initial_a = np.ones(num_questions)
    initial_params = np.concatenate((initial_theta, initial_b, initial_a))

    # Box constraints keep every parameter in [-3, 3].
    bounds = [(-3, 3)] * num_students + [(-3, 3)] * num_questions + [(-3, 3)] * num_questions

    # Maximum-likelihood estimation of theta, b and a jointly.
    # NOTE(review): this is JOINT ML (theta and item parameters estimated
    # together), not the MARGINAL ML (theta integrated out) that jamovi's
    # snowIRT module reports -- estimates will differ systematically.
    result = minimize(log_likelihood, initial_params, args=(data,), bounds=bounds, method='L-BFGS-B')

    # Extract the estimated item parameters from the flat parameter vector.
    estimated_b = result.x[num_students:num_students + num_questions]
    estimated_a = result.x[num_students + num_questions:]

    # Re-estimate theta conditional on the item parameters just obtained.
    estimated_theta = cmle(data, estimated_b, estimated_a)

    # Per-item proportion of correct responses (raw success rates).
    data_succ_proportions = np.sum(data, axis=0) / num_students

    # Model-implied success probabilities for every (student, item) pair.
    predicted_data = logistic_func(estimated_theta.reshape(-1, 1), estimated_b, estimated_a)

    # Binomial variance p*(1-p) of each response, used as the fit weights.
    # (Replaces the original element-by-element double loop with the
    # equivalent vectorized expression.)
    information_matrix = predicted_data * (1 - predicted_data)

    # Outfit: unweighted mean of squared standardized residuals per item.
    # NOTE(review): information can approach 0 for extreme p, which can
    # inflate outfit -- a likely source of divergence from jamovi's values.
    outfit = np.sum((data - predicted_data) ** 2 / information_matrix, axis=0) / (num_students)
    # Infit: information-weighted mean square per item.
    infit = np.sum((data - predicted_data) ** 2, axis=0) / np.sum(information_matrix, axis=0)

    # Ability grid for plotting the item characteristic curves.
    theta_range = np.linspace(-4, 4, num=100)

    # Plot the ICC for each item and print its estimated parameters.
    for j in range(num_questions):
        p = logistic_func(theta_range, estimated_b[j], estimated_a[j])
        print("Item " + str(j) + ":" + str(estimated_b[j]) + " " + str(estimated_a[j]))
        plt.plot(theta_range, p, label=f"Item {j + 1}")

    plt.xlabel("Theta")
    plt.ylabel("Probability")
    plt.title("Item Characteristic Curves (ICC)")
    # Legend customization
    legend = plt.legend(fontsize='x-small')
    legend.get_frame().set_linewidth(0.5)
    legend.get_frame().set_edgecolor('black')
    legend.get_frame().set_alpha(0.7)
    plt.show()

    # Print the raw success rates and the fit statistics per item.
    print("Data success:")
    for item_idx, value in enumerate(data_succ_proportions):
        print(f"Item {item_idx + 1}: {value}")

    print("Infit:")
    for item_idx, value in enumerate(infit):
        print(f"Item {item_idx + 1}: {value}")

    print("\nOutfit:")
    for item_idx, value in enumerate(outfit):
        print(f"Item {item_idx + 1}: {value}")

However, when I use an in-built library in software such as jamovi to compute the same 2-PL IRT model on my dataset, I get a very different set of b estimates, a estimates, and even infit and outfit values. For each item, the b estimates at least exhibit similar trends, but the infit and outfit values do not even show the same trends.

My questions are the following:

  • Am I implementing 2-PL dichotomous IRT Model correctly and computing the infit and outfit stats correctly?
  • On JAMOVI (snowirt module) they say the Marginal Maximum Likelihood estimate was used to obtain the result tables. Is my implementation also for Marginal Maximum Likelihood estimate?
204
  • 433
  • 1
  • 5
  • 19

0 Answers0