Hi, I have been seeing some unexpected behavior related to the sort order of an Iceberg table. I set up the table's SORT ORDER so that rows within each partition are sorted, but judging from the written data the setting is not respected. To showcase the problem I have created a small example:
import os
from pyspark.sql import SparkSession
from pyspark.sql.types import IntegerType, StructField, StructType
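# Iceberg Spark runtime plus the AWS SDK bits so the Hive catalog can talk
# to MinIO through S3FileIO: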
deps = [
"org.apache.iceberg:iceberg-spark-runtime-3.3_2.12:1.2.1",
"org.apache.iceberg:iceberg-aws:1.2.1",
"software.amazon.awssdk:bundle:2.17.257",
"software.amazon.awssdk:url-connection-client:2.17.257",
]
os.environ["PYSPARK_SUBMIT_ARGS"] = f"--packages {','.join(deps)} pyspark-shell"
os.environ["AWS_ACCESS_KEY_ID"] = "minioadmin"
os.environ["AWS_SECRET_ACCESS_KEY"] = "minioadmin"
os.environ["AWS_REGION"] = "eu-east-1"
catalog = "hive_catalog"
spark = (
SparkSession.builder.appName("Iceberg Reader")
.config(
"spark.sql.extensions",
"org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions",
)
.config(f"spark.sql.catalog.{catalog}", "org.apache.iceberg.spark.SparkCatalog")
.config(f"spark.sql.catalog.{catalog}.type", "hive")
.config(f"spark.sql.catalog.{catalog}.uri", "thrift://localhost:9083")
.config(
f"spark.sql.catalog.{catalog}.io-impl", "org.apache.iceberg.aws.s3.S3FileIO"
)
.config(f"spark.sql.catalog.{catalog}.s3.endpoint", "http://localhost:9000")
.config(f"spark.sql.catalog.{catalog}.warehouse", "s3a://lakehouse")
.config("spark.sql.defaultCatalog", catalog)
.config("hive.metastore.uris", "thrift://localhost:9083")
.enableHiveSupport()
.getOrCreate()
)
# Create namespace and table
spark.sql("CREATE NAMESPACE IF NOT EXISTS new_ns;")
spark.sql(
"CREATE TABLE IF NOT EXISTS new_ns.new_table (first_id INT, second_id INT)"
" PARTITIONED BY (second_id);"
)
# Add sort order
spark.sql("ALTER TABLE new_ns.new_table WRITE ORDERED BY first_id")
# Describe
spark.sql("DESCRIBE new_ns.new_table;").show()
#+--------------+---------+-------+
#| col_name|data_type|comment|
#+--------------+---------+-------+
#| first_id| int| |
#| second_id| int| |
#| | | |
#|# Partitioning| | |
#| Part 0|second_id| |
#+--------------+---------+-------+
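Note that DESCRIBE does not display the sort order itself. If I'm not mistaken, the Iceberg Spark catalog also exposes it as a table property, which makes for a quick sanity check:
# Sanity check (assuming the catalog surfaces the sort order as a
# "sort-order" table property, which I believe it does):
spark.sql("SHOW TBLPROPERTIES new_ns.new_table;").show(truncate=False)
# Expecting a row along the lines of: sort-order | first_id ASC NULLS FIRST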
This all shows that the table has been created properly and the partitioning is defined correctly. Inspecting the metadata.json confirms it as well:
{
"format-version" : 1,
"table-uuid" : "4a867367-6207-4e41-b8ea-45bbaf6ca4d0",
"location" : "s3a://lakehouse/new_ns.db/new_table",
"last-updated-ms" : 1683982109298,
"last-column-id" : 2,
"schema" : {
"type" : "struct",
"schema-id" : 0,
"fields" : [ {
"id" : 1,
"name" : "first_id",
"required" : false,
"type" : "int"
}, {
"id" : 2,
"name" : "second_id",
"required" : false,
"type" : "int"
} ]
},
"current-schema-id" : 0,
"schemas" : [ {
"type" : "struct",
"schema-id" : 0,
"fields" : [ {
"id" : 1,
"name" : "first_id",
"required" : false,
"type" : "int"
}, {
"id" : 2,
"name" : "second_id",
"required" : false,
"type" : "int"
} ]
} ],
"partition-spec" : [ {
"name" : "second_id",
"transform" : "identity",
"source-id" : 2,
"field-id" : 1000
} ],
"default-spec-id" : 0,
"partition-specs" : [ {
"spec-id" : 0,
"fields" : [ {
"name" : "second_id",
"transform" : "identity",
"source-id" : 2,
"field-id" : 1000
} ]
} ],
"last-partition-id" : 1000,
"default-sort-order-id" : 1,
"sort-orders" : [ {
"order-id" : 0,
"fields" : [ ]
}, {
"order-id" : 1,
"fields" : [ {
"transform" : "identity",
"source-id" : 1,
"direction" : "asc",
"null-order" : "nulls-first"
} ]
} ],
"properties" : {
"owner" : "thijsvandepoll",
"write.distribution-mode" : "range"
},
"current-snapshot-id" : -1,
"refs" : { },
"snapshots" : [ ],
"statistics" : [ ],
"snapshot-log" : [ ],
"metadata-log" : [ {
"timestamp-ms" : 1683982096980,
"metadata-file" : "s3a://lakehouse/new_ns.db/new_table/metadata/00000-e96e0798-ed81-4e2d-91b2-e4705067999d.metadata.json"
} ]
}
It seems like the sort order is also configured correctly: the default-sort-order-id is set to 1, which refers to the same ordering as the one defined in the SQL.
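For good measure, the same can be verified outside of Spark by parsing the metadata file directly (the filename below is just a placeholder for a copy downloaded from MinIO):
# Double-check outside of Spark by reading the metadata file directly.
# "00001-example.metadata.json" is a placeholder for the local copy.
import json

with open("00001-example.metadata.json") as f:
    meta = json.load(f)
print(meta["default-sort-order-id"])     # -> 1
print(meta["sort-orders"][1]["fields"])  # -> first_id, asc, nulls-first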
Now we continue and try to insert some data:
# Insert some data
spark.createDataFrame(
[(1, 1), (2, 1), (3, 1)],
schema=StructType(
[
StructField("first_id", IntegerType()),
StructField("second_id", IntegerType()),
]
),
).createOrReplaceTempView("tmp")
# Merge into
spark.sql(
"""
MERGE INTO new_ns.new_table A
USING (SELECT * FROM tmp) B
ON A.first_id=B.first_id
WHEN MATCHED THEN UPDATE SET A.first_id=B.first_id, A.second_id=B.second_id
WHEN NOT MATCHED THEN INSERT *
"""
)
spark.sql("SELECT * FROM new_ns.new_table;").show()
#+--------+---------+
#|first_id|second_id|
#+--------+---------+
#| 1| 1|
#| 2| 1|
#| 3| 1|
#+--------+---------+
# Now inspect the .files to check the sort_order_id
spark.sql(
"SELECT partition, record_count, sort_order_id FROM new_ns.new_table.files"
).show()
#+---------+------------+-------------+
#|partition|record_count|sort_order_id|
#+---------+------------+-------------+
#| {1}| 3| 0|
#+---------+------------+-------------+
It shows that the files were written with sort_order_id 0 (the unsorted order) instead of 1. I am not entirely sure whether this is merely reported incorrectly or whether the write actually ignored the sort order.
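To tell these two cases apart, one could read the written file back directly and check the physical row order, and additionally compare against a plain INSERT. A sketch (this assumes the Hadoop S3A connector is also configured for MinIO, e.g. fs.s3a.endpoint, since spark.read.parquet bypasses the Iceberg catalog; three rows are of course a weak test, so a larger frame would be more convincing):
# Read the data file back directly to inspect the physical row order.
file_path = (
    spark.sql("SELECT file_path FROM new_ns.new_table.files").first().file_path
)
spark.read.parquet(file_path).show()

# Compare against a plain INSERT, which should go through the ordered
# write path; if its files get sort_order_id 1 while the MERGE files keep
# 0, MERGE would appear to be the culprit.
spark.sql("INSERT INTO new_ns.new_table VALUES (5, 2), (4, 2)")
spark.sql(
    "SELECT partition, record_count, sort_order_id FROM new_ns.new_table.files"
).show()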
Does anyone have more information on this? Thanks in advance!