diff --git a/Cargo.lock b/Cargo.lock index ff85db24a7c..f6b2d19c29a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -196,25 +196,57 @@ version = "0.7.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50" +[[package]] +name = "arrow" +version = "56.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4fb98341a7e051bb79731ecb33ec00cbd6e0e315a542d6732b46d462c9215ea2" +dependencies = [ + "arrow-arith 56.2.1", + "arrow-array 56.2.1", + "arrow-buffer 56.2.1", + "arrow-cast 56.2.1", + "arrow-data 56.2.1", + "arrow-ord 56.2.1", + "arrow-row 56.2.1", + "arrow-schema 56.2.1", + "arrow-select 56.2.1", + "arrow-string 56.2.1", +] + [[package]] name = "arrow" version = "58.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "378530e55cd479eda3c14eb345310799717e6f76d0c332041e8487022166b471" dependencies = [ - "arrow-arith", - "arrow-array", - "arrow-buffer", - "arrow-cast", + "arrow-arith 58.3.0", + "arrow-array 58.3.0", + "arrow-buffer 58.3.0", + "arrow-cast 58.3.0", "arrow-csv", - "arrow-data", - "arrow-ipc", + "arrow-data 58.3.0", + "arrow-ipc 58.3.0", "arrow-json", - "arrow-ord", - "arrow-row", - "arrow-schema", - "arrow-select", - "arrow-string", + "arrow-ord 58.3.0", + "arrow-row 58.3.0", + "arrow-schema 58.3.0", + "arrow-select 58.3.0", + "arrow-string 58.3.0", +] + +[[package]] +name = "arrow-arith" +version = "56.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6ce4751cbc4bcccfeeea79df9571ff1dc066d61e44723c7604d11c7937f5b560" +dependencies = [ + "arrow-array 56.2.1", + "arrow-buffer 56.2.1", + "arrow-data 56.2.1", + "arrow-schema 56.2.1", + "chrono", + "num", ] [[package]] @@ -223,14 +255,30 @@ version = "58.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a0ab212d2c1886e802f51c5212d78ebbcbb0bec980fff9dadc1eb8d45cd0b738" dependencies = [ - "arrow-array", - 
"arrow-buffer", - "arrow-data", - "arrow-schema", + "arrow-array 58.3.0", + "arrow-buffer 58.3.0", + "arrow-data 58.3.0", + "arrow-schema 58.3.0", "chrono", "num-traits", ] +[[package]] +name = "arrow-array" +version = "56.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b02ccba2e977a3aabb4384036109ca32f552399a2bc0588f925f91ed073ce70c" +dependencies = [ + "ahash 0.8.12", + "arrow-buffer 56.2.1", + "arrow-data 56.2.1", + "arrow-schema 56.2.1", + "chrono", + "half", + "hashbrown 0.16.1", + "num", +] + [[package]] name = "arrow-array" version = "58.3.0" @@ -238,9 +286,9 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cfd33d3e92f207444098c75b42de99d329562be0cf686b307b097cc52b4e999e" dependencies = [ "ahash 0.8.12", - "arrow-buffer", - "arrow-data", - "arrow-schema", + "arrow-buffer 58.3.0", + "arrow-data 58.3.0", + "arrow-schema 58.3.0", "chrono", "chrono-tz", "half", @@ -256,9 +304,9 @@ version = "58.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "049230728cd6e093088c8d231b4beede184e35cad7777c1505c0d5a8571f4376" dependencies = [ - "arrow-array", - "arrow-buffer", - "arrow-schema", + "arrow-array 58.3.0", + "arrow-buffer 58.3.0", + "arrow-schema 58.3.0", "bytes", "bzip2", "crc", @@ -274,6 +322,17 @@ dependencies = [ "zstd", ] +[[package]] +name = "arrow-buffer" +version = "56.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a90f8bece6a9ee316a699fbbfde368a206676a1206ce89b50f07937648e76c3c" +dependencies = [ + "bytes", + "half", + "num", +] + [[package]] name = "arrow-buffer" version = "58.3.0" @@ -286,18 +345,39 @@ dependencies = [ "num-traits", ] +[[package]] +name = "arrow-cast" +version = "56.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "61ffe645cfb4e80b1ca37a3a106ce7b4af66ccdd60c655a57e6b9aab096164a7" +dependencies = [ + "arrow-array 56.2.1", + "arrow-buffer 56.2.1", + "arrow-data 56.2.1", + 
"arrow-schema 56.2.1", + "arrow-select 56.2.1", + "atoi", + "base64", + "chrono", + "comfy-table", + "half", + "lexical-core", + "num", + "ryu", +] + [[package]] name = "arrow-cast" version = "58.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4c5aefb56a2c02e9e2b30746241058b85f8983f0fcff2ba0c6d09006e1cded7f" dependencies = [ - "arrow-array", - "arrow-buffer", - "arrow-data", - "arrow-ord", - "arrow-schema", - "arrow-select", + "arrow-array 58.3.0", + "arrow-buffer 58.3.0", + "arrow-data 58.3.0", + "arrow-ord 58.3.0", + "arrow-schema 58.3.0", + "arrow-select 58.3.0", "atoi", "base64", "chrono", @@ -314,41 +394,67 @@ version = "58.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e94e8cf7e517657a52b91ea1263acf38c4ca62a84655d72458a3359b12ab97de" dependencies = [ - "arrow-array", - "arrow-cast", - "arrow-schema", + "arrow-array 58.3.0", + "arrow-cast 58.3.0", + "arrow-schema 58.3.0", "chrono", "csv", "csv-core", "regex", ] +[[package]] +name = "arrow-data" +version = "56.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "78468c813909465dd0f858950c8a0614eb63608134acf95c602ec21381258b28" +dependencies = [ + "arrow-buffer 56.2.1", + "arrow-schema 56.2.1", + "half", + "num", +] + [[package]] name = "arrow-data" version = "58.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3c88210023a2bfee1896af366309a3028fc3bcbd6515fa29a7990ee1baa08ee0" dependencies = [ - "arrow-buffer", - "arrow-schema", + "arrow-buffer 58.3.0", + "arrow-schema 58.3.0", "half", "num-integer", "num-traits", ] +[[package]] +name = "arrow-ipc" +version = "56.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "31f88b0fbb33af28089ccd3e4dcd0ff09de46842168d00220b920f7231feddf5" +dependencies = [ + "arrow-array 56.2.1", + "arrow-buffer 56.2.1", + "arrow-data 56.2.1", + "arrow-schema 56.2.1", + "arrow-select 56.2.1", + "flatbuffers", +] + [[package]] 
name = "arrow-ipc" version = "58.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "238438f0834483703d88896db6fe5a7138b2230debc31b34c0336c2996e3c64f" dependencies = [ - "arrow-array", - "arrow-buffer", - "arrow-data", - "arrow-schema", - "arrow-select", + "arrow-array 58.3.0", + "arrow-buffer 58.3.0", + "arrow-data 58.3.0", + "arrow-schema 58.3.0", + "arrow-select 58.3.0", "flatbuffers", - "lz4_flex", + "lz4_flex 0.13.1", "zstd", ] @@ -358,12 +464,12 @@ version = "58.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "205ca2119e6d679d5c133c6f30e68f027738d95ed948cf77677ea69c7800036b" dependencies = [ - "arrow-array", - "arrow-buffer", - "arrow-cast", - "arrow-ord", - "arrow-schema", - "arrow-select", + "arrow-array 58.3.0", + "arrow-buffer 58.3.0", + "arrow-cast 58.3.0", + "arrow-ord 58.3.0", + "arrow-schema 58.3.0", + "arrow-select 58.3.0", "chrono", "half", "indexmap 2.14.0", @@ -377,17 +483,43 @@ dependencies = [ "simdutf8", ] +[[package]] +name = "arrow-ord" +version = "56.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aed58a38c3db0a2cf75ef70e3cb6bc4bd0da0a3d390de37c36139b31fae826e8" +dependencies = [ + "arrow-array 56.2.1", + "arrow-buffer 56.2.1", + "arrow-data 56.2.1", + "arrow-schema 56.2.1", + "arrow-select 56.2.1", +] + [[package]] name = "arrow-ord" version = "58.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1bffd8fd2579286a5d63bac898159873e5094a79009940bcb42bbfce4f19f1d0" dependencies = [ - "arrow-array", - "arrow-buffer", - "arrow-data", - "arrow-schema", - "arrow-select", + "arrow-array 58.3.0", + "arrow-buffer 58.3.0", + "arrow-data 58.3.0", + "arrow-schema 58.3.0", + "arrow-select 58.3.0", +] + +[[package]] +name = "arrow-row" +version = "56.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "079ced0517daf4f09b070d09ff641cee7cc331aa216bebcb25d1a6474ad53086" +dependencies = [ + 
"arrow-array 56.2.1", + "arrow-buffer 56.2.1", + "arrow-data 56.2.1", + "arrow-schema 56.2.1", + "half", ] [[package]] @@ -396,13 +528,19 @@ version = "58.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bab5994731204603c73ba69267616c50f80780774c6bb0476f1f830625115e0c" dependencies = [ - "arrow-array", - "arrow-buffer", - "arrow-data", - "arrow-schema", + "arrow-array 58.3.0", + "arrow-buffer 58.3.0", + "arrow-data 58.3.0", + "arrow-schema 58.3.0", "half", ] +[[package]] +name = "arrow-schema" +version = "56.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5a0d5eb3fe25337ff83e8333a08379bdd1540b0961b1c888f6e505d971c198e1" + [[package]] name = "arrow-schema" version = "58.3.0" @@ -414,6 +552,20 @@ dependencies = [ "serde_json", ] +[[package]] +name = "arrow-select" +version = "56.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2368a78bd32902dba39d52519d70f63799c8b5dc8a9477129a30c2fd3dc70c19" +dependencies = [ + "ahash 0.8.12", + "arrow-array 56.2.1", + "arrow-buffer 56.2.1", + "arrow-data 56.2.1", + "arrow-schema 56.2.1", + "num", +] + [[package]] name = "arrow-select" version = "58.3.0" @@ -421,24 +573,41 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8cd065c54172ac787cf3f2f8d4107e0d3fdc26edba76fdf4f4cc170258942222" dependencies = [ "ahash 0.8.12", - "arrow-array", - "arrow-buffer", - "arrow-data", - "arrow-schema", + "arrow-array 58.3.0", + "arrow-buffer 58.3.0", + "arrow-data 58.3.0", + "arrow-schema 58.3.0", "num-traits", ] +[[package]] +name = "arrow-string" +version = "56.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dece58a130b9187756ded8bc071bd8ee9dd7a146566af244b297c7e632fd1ef7" +dependencies = [ + "arrow-array 56.2.1", + "arrow-buffer 56.2.1", + "arrow-data 56.2.1", + "arrow-schema 56.2.1", + "arrow-select 56.2.1", + "memchr", + "num", + "regex", + "regex-syntax", +] + [[package]] name 
= "arrow-string" version = "58.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "29dd7cda3ab9692f43a2e4acc444d760cc17b12bb6d8232ddf64e9bab7c06b42" dependencies = [ - "arrow-array", - "arrow-buffer", - "arrow-data", - "arrow-schema", - "arrow-select", + "arrow-array 58.3.0", + "arrow-buffer 58.3.0", + "arrow-data 58.3.0", + "arrow-schema 58.3.0", + "arrow-select 58.3.0", "memchr", "num-traits", "regex", @@ -1355,11 +1524,12 @@ dependencies = [ [[package]] name = "comfy-table" -version = "7.1.4" +version = "7.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4a65ebfec4fb190b6f90e944a817d60499ee0744e582530e2c9900a22e591d9a" +checksum = "e0d05af1e006a2407bedef5af410552494ce5be9090444dbbcb57258c1af3d56" dependencies = [ - "unicode-segmentation", + "strum 0.26.3", + "strum_macros 0.26.4", "unicode-width 0.2.2", ] @@ -1382,8 +1552,8 @@ name = "compress-bench" version = "0.1.0" dependencies = [ "anyhow", - "arrow-array", - "arrow-schema", + "arrow-array 58.3.0", + "arrow-schema 58.3.0", "async-trait", "bytes", "clap", @@ -1391,7 +1561,7 @@ dependencies = [ "indicatif", "itertools 0.14.0", "lance-bench", - "parquet", + "parquet 58.3.0", "regex", "tokio", "tracing", @@ -1907,8 +2077,8 @@ version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "93db0e623840612f7f2cd757f7e8a8922064192363732c88692e0870016e141b" dependencies = [ - "arrow", - "arrow-schema", + "arrow 58.3.0", + "arrow-schema 58.3.0", "async-trait", "bytes", "chrono", @@ -1956,8 +2126,8 @@ version = "54.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "997a31e15872606a49478e670c58302094c97cb96abb0a7d60720f8e92170040" dependencies = [ - "arrow", - "arrow-schema", + "arrow 58.3.0", + "arrow-schema 58.3.0", "async-trait", "bzip2", "chrono", @@ -1995,7 +2165,7 @@ dependencies = [ "log", "object_store 0.13.2", "parking_lot", - "parquet", + "parquet 58.3.0", "sqlparser 0.62.0", 
"tempfile", "tokio", @@ -2015,6 +2185,7 @@ dependencies = [ "datafusion-common 54.0.0", "datafusion-physical-plan 54.0.0", "futures", + "geodatafusion", "itertools 0.14.0", "object_store 0.13.2", "opentelemetry", @@ -2027,6 +2198,7 @@ dependencies = [ "vortex-bench", "vortex-cuda", "vortex-datafusion", + "vortex-geo", "vortex-metrics", ] @@ -2036,7 +2208,7 @@ version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "37cefde60b26a7f4ff61e9d2ff2833322f91df2b568d7238afe67bde5bdffb66" dependencies = [ - "arrow", + "arrow 58.3.0", "async-trait", "dashmap", "datafusion-common 53.1.0", @@ -2061,7 +2233,7 @@ version = "54.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f7dd61161508f8f5fa1107774ea687bd753c22d83a32eebf963549f89de14139" dependencies = [ - "arrow", + "arrow 58.3.0", "async-trait", "dashmap", "datafusion-common 54.0.0", @@ -2086,7 +2258,7 @@ version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "17e112307715d6a7a331111a4c2330ff54bc237183511c319e3708a4cff431fb" dependencies = [ - "arrow", + "arrow 58.3.0", "async-trait", "datafusion-catalog 53.1.0", "datafusion-common 53.1.0", @@ -2109,7 +2281,7 @@ version = "54.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "897c70f871277f9ce99aa38347be0d679bbe3e617156c4d2a8378cec8a2a0891" dependencies = [ - "arrow", + "arrow 58.3.0", "async-trait", "datafusion-catalog 54.0.0", "datafusion-common 54.0.0", @@ -2133,8 +2305,8 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d72a11ca44a95e1081870d3abb80c717496e8a7acb467a1d3e932bb636af5cc2" dependencies = [ "ahash 0.8.12", - "arrow", - "arrow-ipc", + "arrow 58.3.0", + "arrow-ipc 58.3.0", "chrono", "half", "hashbrown 0.16.1", @@ -2155,9 +2327,9 @@ version = "54.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "121c9ded5d87d9172319e006f2afdb9928d72dbacd6a90a458d8acb1e3b43a65" 
dependencies = [ - "arrow", - "arrow-ipc", - "arrow-schema", + "arrow 58.3.0", + "arrow-ipc 58.3.0", + "arrow-schema 58.3.0", "chrono", "foldhash 0.2.0", "half", @@ -2167,7 +2339,7 @@ dependencies = [ "libc", "log", "object_store 0.13.2", - "parquet", + "parquet 58.3.0", "recursive", "sqlparser 0.62.0", "tokio", @@ -2203,7 +2375,7 @@ version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e9fb386e1691355355a96419978a0022b7947b44d4a24a6ea99f00b6b485cbb6" dependencies = [ - "arrow", + "arrow 58.3.0", "async-trait", "bytes", "chrono", @@ -2232,7 +2404,7 @@ version = "54.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ffd7d295b2ec7c00d8a56562f41ed41062cf0af75549ed891c12a0a09eddfefe" dependencies = [ - "arrow", + "arrow 58.3.0", "async-compression", "async-trait", "bytes", @@ -2268,8 +2440,8 @@ version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ffa6c52cfed0734c5f93754d1c0175f558175248bf686c944fb05c373e5fc096" dependencies = [ - "arrow", - "arrow-ipc", + "arrow 58.3.0", + "arrow-ipc 58.3.0", "async-trait", "bytes", "datafusion-common 53.1.0", @@ -2292,8 +2464,8 @@ version = "54.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "552b0b3f342f7ec41b3fbd70f6339dc82a30cfd0349e7f280e7852528085349f" dependencies = [ - "arrow", - "arrow-ipc", + "arrow 58.3.0", + "arrow-ipc 58.3.0", "async-trait", "bytes", "datafusion-common 54.0.0", @@ -2316,7 +2488,7 @@ version = "54.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fb517d08967d536284ce70afb5fe8583133779249f2d7b90587d339741a7f195" dependencies = [ - "arrow", + "arrow 58.3.0", "arrow-avro", "async-trait", "bytes", @@ -2335,7 +2507,7 @@ version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "503f29e0582c1fc189578d665ff57d9300da1f80c282777d7eb67bb79fb8cdca" dependencies = [ - "arrow", + "arrow 58.3.0", 
"async-trait", "bytes", "datafusion-common 53.1.0", @@ -2358,7 +2530,7 @@ version = "54.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "68850aa426b897e879c8b87e512ea8124f1d0a2869a4e51808ddaaddf1bc0ada" dependencies = [ - "arrow", + "arrow 58.3.0", "async-trait", "bytes", "datafusion-common 54.0.0", @@ -2381,7 +2553,7 @@ version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e33804749abc8d0c8cb7473228483cb8070e524c6f6086ee1b85a64debe2b3d2" dependencies = [ - "arrow", + "arrow 58.3.0", "async-trait", "bytes", "datafusion-common 53.1.0", @@ -2405,7 +2577,7 @@ version = "54.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "402f93242ae08ef99139ee2c528a49d087efe88d5c7b2c3ff5480855a40ce54f" dependencies = [ - "arrow", + "arrow 58.3.0", "async-trait", "bytes", "datafusion-common 54.0.0", @@ -2428,7 +2600,7 @@ version = "54.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ffd2499c1bee0eeccf6a57156105700eeeb17bc701899ac719183c4e74231450" dependencies = [ - "arrow", + "arrow 58.3.0", "async-trait", "bytes", "datafusion-common 54.0.0", @@ -2449,7 +2621,7 @@ dependencies = [ "log", "object_store 0.13.2", "parking_lot", - "parquet", + "parquet 58.3.0", "tokio", ] @@ -2471,8 +2643,8 @@ version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c03c7fbdaefcca4ef6ffe425a5fc2325763bfb426599bb0bf4536466efabe709" dependencies = [ - "arrow", - "arrow-buffer", + "arrow 58.3.0", + "arrow-buffer 58.3.0", "async-trait", "chrono", "dashmap", @@ -2494,8 +2666,8 @@ version = "54.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "37a8643ab852eb68864e1b72ae789e8066282dce48eea6347ffb0aee33d1ccc0" dependencies = [ - "arrow", - "arrow-buffer", + "arrow 58.3.0", + "arrow-buffer 58.3.0", "async-trait", "dashmap", "datafusion-common 54.0.0", @@ -2516,7 +2688,7 @@ version = "53.1.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "574b9b6977fedbd2a611cbff12e5caf90f31640ad9dc5870f152836d94bad0dd" dependencies = [ - "arrow", + "arrow 58.3.0", "async-trait", "chrono", "datafusion-common 53.1.0", @@ -2538,8 +2710,8 @@ version = "54.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6932f4d71eed9c8d9341476a2b845aadfabde5495d08dbcd8fc23881f49fa7a0" dependencies = [ - "arrow", - "arrow-schema", + "arrow 58.3.0", + "arrow-schema 58.3.0", "async-trait", "chrono", "datafusion-common 54.0.0", @@ -2561,7 +2733,7 @@ version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7d7c3adf3db8bf61e92eb90cb659c8e8b734593a8f7c8e12a843c7ddba24b87e" dependencies = [ - "arrow", + "arrow 58.3.0", "datafusion-common 53.1.0", "indexmap 2.14.0", "itertools 0.14.0", @@ -2574,7 +2746,7 @@ version = "54.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0225491839a31b1f7d2cb8092c2d50792e2fe1c1724e4e6d08e011f5feaf4ed2" dependencies = [ - "arrow", + "arrow 58.3.0", "datafusion-common 54.0.0", "indexmap 2.14.0", "itertools 0.14.0", @@ -2586,8 +2758,8 @@ version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f28aa4e10384e782774b10e72aca4d93ef7b31aa653095d9d4536b0a3dbc51b6" dependencies = [ - "arrow", - "arrow-buffer", + "arrow 58.3.0", + "arrow-buffer 58.3.0", "base64", "blake2", "blake3", @@ -2618,8 +2790,8 @@ version = "54.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "14872c47bfc3d21e53ec82f57074e6987a15941c1e2f43cde4ac6ae2746634e3" dependencies = [ - "arrow", - "arrow-buffer", + "arrow 58.3.0", + "arrow-buffer 58.3.0", "base64", "blake2", "blake3", @@ -2651,7 +2823,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "00aa6217e56098ba84e0a338176fe52f0a84cca398021512c6c8c5eff806d0ad" dependencies = [ "ahash 0.8.12", - "arrow", + "arrow 58.3.0", 
"datafusion-common 53.1.0", "datafusion-doc 53.1.0", "datafusion-execution 53.1.0", @@ -2672,7 +2844,7 @@ version = "54.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "75a2ca14e1b609be21e657e2d3130b2f446456b08393b377bb721a33952d2e09" dependencies = [ - "arrow", + "arrow 58.3.0", "datafusion-common 54.0.0", "datafusion-doc 54.0.0", "datafusion-execution 54.0.0", @@ -2694,7 +2866,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b511250349407db7c43832ab2de63f5557b19a20dfd236b39ca2c04468b50d47" dependencies = [ "ahash 0.8.12", - "arrow", + "arrow 58.3.0", "datafusion-common 53.1.0", "datafusion-expr-common 53.1.0", "datafusion-physical-expr-common 53.1.0", @@ -2706,7 +2878,7 @@ version = "54.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1ece74ba09092d2ef9c9b54a38445450aea292a1f8b04faf531936b723a24b3c" dependencies = [ - "arrow", + "arrow 58.3.0", "datafusion-common 54.0.0", "datafusion-expr-common 54.0.0", "datafusion-physical-expr-common 54.0.0", @@ -2718,8 +2890,8 @@ version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ef13a858e20d50f0a9bb5e96e7ac82b4e7597f247515bccca4fdd2992df0212a" dependencies = [ - "arrow", - "arrow-ord", + "arrow 58.3.0", + "arrow-ord 58.3.0", "datafusion-common 53.1.0", "datafusion-doc 53.1.0", "datafusion-execution 53.1.0", @@ -2743,8 +2915,8 @@ version = "54.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3f3e3f9ee8ca59bf70518802107de6f1b88a9509efdc629fadc5de9d6b2d5ef5" dependencies = [ - "arrow", - "arrow-ord", + "arrow 58.3.0", + "arrow-ord 58.3.0", "datafusion-common 54.0.0", "datafusion-doc 54.0.0", "datafusion-execution 54.0.0", @@ -2768,7 +2940,7 @@ version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "72b40d3f5bbb3905f9ccb1ce9485a9595c77b69758a7c24d3ba79e334ff51e7e" dependencies = [ - "arrow", + "arrow 58.3.0", 
"async-trait", "datafusion-catalog 53.1.0", "datafusion-common 53.1.0", @@ -2784,7 +2956,7 @@ version = "54.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "89161dffc22cf2b50f9f4b1bee83b5221d3b4ed7c2e37fd7aa2b22a5297b3a26" dependencies = [ - "arrow", + "arrow 58.3.0", "async-trait", "datafusion-catalog 54.0.0", "datafusion-common 54.0.0", @@ -2800,7 +2972,7 @@ version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d4e88ec9d57c9b685d02f58bfee7be62d72610430ddcedb82a08e5d9925dbfb6" dependencies = [ - "arrow", + "arrow 58.3.0", "datafusion-common 53.1.0", "datafusion-doc 53.1.0", "datafusion-expr 53.1.0", @@ -2818,7 +2990,7 @@ version = "54.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d7339345b226b3874037708bf5023ba1c2de705128f8457a095aae5ae9cb9c78" dependencies = [ - "arrow", + "arrow 58.3.0", "datafusion-common 54.0.0", "datafusion-doc 54.0.0", "datafusion-expr 54.0.0", @@ -2877,7 +3049,7 @@ version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e929015451a67f77d9d8b727b2bf3a40c4445fdef6cdc53281d7d97c76888ace" dependencies = [ - "arrow", + "arrow 58.3.0", "chrono", "datafusion-common 53.1.0", "datafusion-expr 53.1.0", @@ -2896,7 +3068,7 @@ version = "54.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "77f20e8cf9e8654d92f4c16b24c487353ee5bf153ffc12d5772cd399ab8cd281" dependencies = [ - "arrow", + "arrow 58.3.0", "chrono", "datafusion-common 54.0.0", "datafusion-expr 54.0.0", @@ -2917,7 +3089,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4b1e68aba7a4b350401cfdf25a3d6f989ad898a7410164afe9ca52080244cb59" dependencies = [ "ahash 0.8.12", - "arrow", + "arrow 58.3.0", "datafusion-common 53.1.0", "datafusion-expr 53.1.0", "datafusion-expr-common 53.1.0", @@ -2939,7 +3111,7 @@ version = "54.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" 
checksum = "f015a4a82f6f7ff7e1d8d4bf3870a936752fa38b17705dfcc14adef95aa8922c" dependencies = [ - "arrow", + "arrow 58.3.0", "datafusion-common 54.0.0", "datafusion-expr 54.0.0", "datafusion-expr-common 54.0.0", @@ -2961,7 +3133,7 @@ version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ea22315f33cf2e0adc104e8ec42e285f6ed93998d565c65e82fec6a9ee9f9db4" dependencies = [ - "arrow", + "arrow 58.3.0", "datafusion-common 53.1.0", "datafusion-expr 53.1.0", "datafusion-functions 53.1.0", @@ -2976,7 +3148,7 @@ version = "54.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "51e6ffff8acdfe54e0ea15ccf38115c4a9184433b0439f42907637928d00a235" dependencies = [ - "arrow", + "arrow 58.3.0", "datafusion-common 54.0.0", "datafusion-expr 54.0.0", "datafusion-functions 54.0.0", @@ -2992,7 +3164,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b04b45ea8ad3ac2d78f2ea2a76053e06591c9629c7a603eda16c10649ecf4362" dependencies = [ "ahash 0.8.12", - "arrow", + "arrow 58.3.0", "chrono", "datafusion-common 53.1.0", "datafusion-expr-common 53.1.0", @@ -3008,7 +3180,7 @@ version = "54.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7967a3e171c6a4bf09474b3f7a14f1a3db13ed1714ba12156f33fcce2bba54e8" dependencies = [ - "arrow", + "arrow 58.3.0", "chrono", "datafusion-common 54.0.0", "datafusion-expr-common 54.0.0", @@ -3025,7 +3197,7 @@ version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7cb13397809a425918f608dfe8653f332015a3e330004ab191b4404187238b95" dependencies = [ - "arrow", + "arrow 58.3.0", "datafusion-common 53.1.0", "datafusion-execution 53.1.0", "datafusion-expr 53.1.0", @@ -3043,7 +3215,7 @@ version = "54.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "59ff803e2a96054cb6d83f35f9e60fd4f42eac515e1932bd1b2dbc91d5fcbf36" dependencies = [ - "arrow", + "arrow 58.3.0", 
"datafusion-common 54.0.0", "datafusion-execution 54.0.0", "datafusion-expr 54.0.0", @@ -3063,9 +3235,9 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5edc023675791af9d5fb4cc4c24abf5f7bd3bd4dcf9e5bd90ea1eff6976dcc79" dependencies = [ "ahash 0.8.12", - "arrow", - "arrow-ord", - "arrow-schema", + "arrow 58.3.0", + "arrow-ord 58.3.0", + "arrow-schema 58.3.0", "async-trait", "datafusion-common 53.1.0", "datafusion-common-runtime 53.1.0", @@ -3094,11 +3266,11 @@ version = "54.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "776ee54d47d15bdb126452f9ca17b03761e3b004682914beaedd3f86eb507fbc" dependencies = [ - "arrow", - "arrow-data", - "arrow-ipc", - "arrow-ord", - "arrow-schema", + "arrow 58.3.0", + "arrow-data 58.3.0", + "arrow-ipc 58.3.0", + "arrow-ord 58.3.0", + "arrow-schema 58.3.0", "async-trait", "datafusion-common 54.0.0", "datafusion-common-runtime 54.0.0", @@ -3127,7 +3299,7 @@ version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ac8c76860e355616555081cab5968cec1af7a80701ff374510860bcd567e365a" dependencies = [ - "arrow", + "arrow 58.3.0", "datafusion-common 53.1.0", "datafusion-datasource 53.1.0", "datafusion-expr-common 53.1.0", @@ -3144,7 +3316,7 @@ version = "54.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d5fb9e5774660aa69c3ba93c610f175f75b65cb8c3776edb3626de8f3a4f4ee3" dependencies = [ - "arrow", + "arrow 58.3.0", "datafusion-common 54.0.0", "datafusion-datasource 54.0.0", "datafusion-expr-common 54.0.0", @@ -3188,7 +3360,7 @@ version = "54.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "390bb0bf37cb2b95ffd65eacd66f60df50793d1f94097799e416f39477a51957" dependencies = [ - "arrow", + "arrow 58.3.0", "bigdecimal", "chrono", "crc32fast", @@ -3218,7 +3390,7 @@ version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"fa0d133ddf8b9b3b872acac900157f783e7b879fe9a6bccf389abebbfac45ec1" dependencies = [ - "arrow", + "arrow 58.3.0", "bigdecimal", "chrono", "datafusion-common 53.1.0", @@ -3236,7 +3408,7 @@ version = "54.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6094ad36a3ed6d7ac87b20b479b2d0b118250f66cf997603829fdc65b44a7099" dependencies = [ - "arrow", + "arrow 58.3.0", "bigdecimal", "chrono", "datafusion-common 54.0.0", @@ -3255,7 +3427,7 @@ version = "54.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e0c08025966108056d3547d879c4d39e246277494f59ca12920f78187d95eea1" dependencies = [ - "arrow", + "arrow 58.3.0", "async-trait", "bigdecimal", "clap", @@ -3767,7 +3939,7 @@ version = "6.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "83cf860f6a6bf0a6a60fdfe5a36c75121fad5ea4332d1d12deee3e65b6047727" dependencies = [ - "arrow-array", + "arrow-array 58.3.0", "rand 0.9.4", ] @@ -3992,14 +4164,27 @@ version = "0.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dafe7b7de3fab1a8b7099fd6a6434ca955fa65065f9c19f0f8a133693f3c2b0e" dependencies = [ - "arrow-array", - "arrow-buffer", - "arrow-schema", + "arrow-array 58.3.0", + "arrow-buffer 58.3.0", + "arrow-schema 58.3.0", "geo-traits", "geoarrow-schema", "num-traits", "wkb", - "wkt", + "wkt 0.14.0", +] + +[[package]] +name = "geoarrow-cast" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "41c308d653690a4e8ef3cbba69696056bd819e624766ece66d64cc26a638acc1" +dependencies = [ + "arrow-schema 58.3.0", + "geo-traits", + "geoarrow-array", + "geoarrow-schema", + "wkt 0.14.0", ] [[package]] @@ -4008,8 +4193,8 @@ version = "0.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8e4a62ac19c86827c6ec81ea584594b3ee96db5a8119b9774d3466c6b373c434" dependencies = [ - "arrow-array", - "arrow-buffer", + "arrow-array 58.3.0", + "arrow-buffer 58.3.0", 
"geo", "geo-traits", "geoarrow-array", @@ -4022,7 +4207,7 @@ version = "0.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4d4a7edb2a1d87024a93805332a9c8184a0354836271d42c0d18cf628a5e3cd0" dependencies = [ - "arrow-schema", + "arrow-schema 58.3.0", "geo-traits", "serde", "serde_json", @@ -4034,9 +4219,9 @@ name = "geodatafusion" version = "0.4.0" source = "git+https://github.com/HarukiMoriarty/geodatafusion?rev=3d50d7e549df720707133852848edd1ecff89265#3d50d7e549df720707133852848edd1ecff89265" dependencies = [ - "arrow-arith", - "arrow-array", - "arrow-schema", + "arrow-arith 58.3.0", + "arrow-array 58.3.0", + "arrow-schema 58.3.0", "datafusion 54.0.0", "geo", "geo-traits", @@ -4045,7 +4230,7 @@ dependencies = [ "geoarrow-schema", "geohash", "thiserror 1.0.69", - "wkt", + "wkt 0.14.0", ] [[package]] @@ -4067,6 +4252,33 @@ dependencies = [ "libm", ] +[[package]] +name = "geojson" +version = "0.24.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e26f3c45b36fccc9cf2805e61d4da6bc4bbd5a3a9589b01afa3a40eff703bd79" +dependencies = [ + "log", + "serde", + "serde_json", + "thiserror 2.0.18", +] + +[[package]] +name = "geozero" +version = "0.14.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e5f28f34864745eb2f123c990c6ffd92c1584bd39439b3f27ff2a0f4ea5b309b" +dependencies = [ + "geo-types", + "geojson", + "log", + "scroll", + "serde_json", + "thiserror 1.0.69", + "wkt 0.11.1", +] + [[package]] name = "get_dir" version = "0.5.0" @@ -4986,16 +5198,16 @@ version = "6.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d34e854994e84d043897f5ec9fb609221e9e69e3fd52996cd715d979fcd349f6" dependencies = [ - "arrow", - "arrow-arith", - "arrow-array", - "arrow-buffer", - "arrow-cast", - "arrow-ipc", - "arrow-ord", - "arrow-row", - "arrow-schema", - "arrow-select", + "arrow 58.3.0", + "arrow-arith 58.3.0", + "arrow-array 58.3.0", + "arrow-buffer 58.3.0", + 
"arrow-cast 58.3.0", + "arrow-ipc 58.3.0", + "arrow-ord 58.3.0", + "arrow-row 58.3.0", + "arrow-schema 58.3.0", + "arrow-select 58.3.0", "async-recursion", "async-trait", "async_cell", @@ -5054,14 +5266,14 @@ version = "6.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7827fe404358c27d120ee8ea8ef7b9415c2911d54072bec83dd689d750ae65da" dependencies = [ - "arrow-array", - "arrow-buffer", - "arrow-cast", - "arrow-data", - "arrow-ipc", - "arrow-ord", - "arrow-schema", - "arrow-select", + "arrow-array 58.3.0", + "arrow-buffer 58.3.0", + "arrow-cast 58.3.0", + "arrow-data 58.3.0", + "arrow-ipc 58.3.0", + "arrow-ord 58.3.0", + "arrow-schema 58.3.0", + "arrow-select 58.3.0", "bytes", "futures", "getrandom 0.2.17", @@ -5076,13 +5288,13 @@ name = "lance-bench" version = "0.1.0" dependencies = [ "anyhow", - "arrow-cast", + "arrow-cast 58.3.0", "async-trait", "clap", "futures", "lance", "lance-encoding", - "parquet", + "parquet 58.3.0", "tempfile", "tokio", "tracing", @@ -5106,9 +5318,9 @@ version = "6.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b128c213c676cb8e03c62a68670642770825171e64097cc2da97cbb19fe35d29" dependencies = [ - "arrow-array", - "arrow-buffer", - "arrow-schema", + "arrow-array 58.3.0", + "arrow-buffer 58.3.0", + "arrow-schema 58.3.0", "async-trait", "byteorder", "bytes", @@ -5145,13 +5357,13 @@ version = "6.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e03b2de71cbcd09b10bf1a17c83cacbc0176ecd97203fb72b9e59d9b8f9a3743" dependencies = [ - "arrow", - "arrow-array", - "arrow-buffer", - "arrow-cast", - "arrow-ord", - "arrow-schema", - "arrow-select", + "arrow 58.3.0", + "arrow-array 58.3.0", + "arrow-buffer 58.3.0", + "arrow-cast 58.3.0", + "arrow-ord 58.3.0", + "arrow-schema 58.3.0", + "arrow-select 58.3.0", "async-trait", "chrono", "datafusion 53.1.0", @@ -5178,10 +5390,10 @@ version = "6.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" 
checksum = "2fe7c7ea7fd397e495a1646fec360e46ee0cbd75718f1c0e887aad657c5f2944" dependencies = [ - "arrow", - "arrow-array", - "arrow-cast", - "arrow-schema", + "arrow 58.3.0", + "arrow-array 58.3.0", + "arrow-cast 58.3.0", + "arrow-schema 58.3.0", "chrono", "futures", "half", @@ -5198,13 +5410,13 @@ version = "6.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fe3f8070835b407d8db9ea8728386bc3207ba23c66a9c22d344e231ef12b77ca" dependencies = [ - "arrow-arith", - "arrow-array", - "arrow-buffer", - "arrow-cast", - "arrow-data", - "arrow-schema", - "arrow-select", + "arrow-arith 58.3.0", + "arrow-array 58.3.0", + "arrow-buffer 58.3.0", + "arrow-cast 58.3.0", + "arrow-data 58.3.0", + "arrow-schema 58.3.0", + "arrow-select 58.3.0", "bytemuck", "byteorder", "bytes", @@ -5237,12 +5449,12 @@ version = "6.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a6dfcf654549330df3aef708cd7c12e170feecddd34d6c19dd005b4153213268" dependencies = [ - "arrow-arith", - "arrow-array", - "arrow-buffer", - "arrow-data", - "arrow-schema", - "arrow-select", + "arrow-arith 58.3.0", + "arrow-array 58.3.0", + "arrow-buffer 58.3.0", + "arrow-data 58.3.0", + "arrow-schema 58.3.0", + "arrow-select 58.3.0", "async-recursion", "async-trait", "byteorder", @@ -5271,12 +5483,12 @@ version = "6.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4fb8ad0bd10efa2608634a2518b7dd501231e76c56a65fbd6519e23914cc425a" dependencies = [ - "arrow", - "arrow-arith", - "arrow-array", - "arrow-ord", - "arrow-schema", - "arrow-select", + "arrow 58.3.0", + "arrow-arith 58.3.0", + "arrow-array 58.3.0", + "arrow-ord 58.3.0", + "arrow-schema 58.3.0", + "arrow-select 58.3.0", "async-channel", "async-recursion", "async-trait", @@ -5337,14 +5549,14 @@ version = "6.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ef5314703fa8c8baed04193cc669da80ab42521c6319d3cc921a4a997690dcc0" dependencies = [ - "arrow", 
- "arrow-arith", - "arrow-array", - "arrow-buffer", - "arrow-cast", - "arrow-data", - "arrow-schema", - "arrow-select", + "arrow 58.3.0", + "arrow-arith 58.3.0", + "arrow-array 58.3.0", + "arrow-buffer 58.3.0", + "arrow-cast 58.3.0", + "arrow-data 58.3.0", + "arrow-schema 58.3.0", + "arrow-select 58.3.0", "async-recursion", "async-trait", "byteorder", @@ -5379,9 +5591,9 @@ version = "6.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "51aa9b73279f505b2bec0f194c7a2390ca74ad3260131e631a7bef8d97d54b2e" dependencies = [ - "arrow-array", - "arrow-buffer", - "arrow-schema", + "arrow-array 58.3.0", + "arrow-buffer 58.3.0", + "arrow-schema 58.3.0", "cc", "deepsize", "half", @@ -5397,7 +5609,7 @@ version = "6.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "39cd01581f55ce45c49cbe494ee86c7ba7ca4ca3654690fd820941cd9105a46e" dependencies = [ - "arrow", + "arrow 58.3.0", "async-trait", "bytes", "lance-core", @@ -5426,11 +5638,11 @@ version = "6.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5db70650465a1af174b7dfe6948ec91a3d466ada12e11274eb66e51132173aa0" dependencies = [ - "arrow", - "arrow-array", - "arrow-buffer", - "arrow-ipc", - "arrow-schema", + "arrow 58.3.0", + "arrow-array 58.3.0", + "arrow-buffer 58.3.0", + "arrow-ipc 58.3.0", + "arrow-schema 58.3.0", "async-trait", "byteorder", "bytes", @@ -5770,6 +5982,15 @@ dependencies = [ "libc", ] +[[package]] +name = "lz4_flex" +version = "0.11.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "373f5eceeeab7925e0c1098212f2fbc4d416adec9d35051a6ab251e824c1854a" +dependencies = [ + "twox-hash", +] + [[package]] name = "lz4_flex" version = "0.13.1" @@ -6109,6 +6330,20 @@ dependencies = [ "windows-sys 0.61.2", ] +[[package]] +name = "num" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "35bd024e8b2ff75562e5f34e7f4905839deb4b22955ef5e73d2fea1b9813cb23" 
+dependencies = [ + "num-bigint", + "num-complex", + "num-integer", + "num-iter", + "num-rational", + "num-traits", +] + [[package]] name = "num-bigint" version = "0.4.6" @@ -6143,6 +6378,28 @@ dependencies = [ "num-traits", ] +[[package]] +name = "num-iter" +version = "0.1.45" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1429034a0490724d0075ebb2bc9e875d6503c3cf69e235a8941aa757d83ef5bf" +dependencies = [ + "autocfg", + "num-integer", + "num-traits", +] + +[[package]] +name = "num-rational" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f83d14da390562dca69fc84082e73e548e1ad308d24accdedd2720017cb37824" +dependencies = [ + "num-bigint", + "num-integer", + "num-traits", +] + [[package]] name = "num-traits" version = "0.2.19" @@ -6521,6 +6778,41 @@ dependencies = [ "windows-link", ] +[[package]] +name = "parquet" +version = "56.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d3abbfef8a25900f4925c86e4cb881ea24672ca3c31ee4fb50a8083c4c56d313" +dependencies = [ + "ahash 0.8.12", + "arrow-array 56.2.1", + "arrow-buffer 56.2.1", + "arrow-cast 56.2.1", + "arrow-data 56.2.1", + "arrow-ipc 56.2.1", + "arrow-schema 56.2.1", + "arrow-select 56.2.1", + "base64", + "brotli", + "bytes", + "chrono", + "flate2", + "futures", + "half", + "hashbrown 0.16.1", + "lz4_flex 0.11.6", + "num", + "num-bigint", + "paste", + "seq-macro", + "simdutf8", + "snap", + "thrift", + "tokio", + "twox-hash", + "zstd", +] + [[package]] name = "parquet" version = "58.3.0" @@ -6528,12 +6820,12 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5dafa7d01085b62a47dd0c1829550a0a36710ea9c4fe358a05a85477cec8a908" dependencies = [ "ahash 0.8.12", - "arrow-array", - "arrow-buffer", - "arrow-data", - "arrow-ipc", - "arrow-schema", - "arrow-select", + "arrow-array 58.3.0", + "arrow-buffer 58.3.0", + "arrow-data 58.3.0", + "arrow-ipc 58.3.0", + "arrow-schema 58.3.0", + 
"arrow-select 58.3.0", "base64", "brotli", "bytes", @@ -6542,7 +6834,7 @@ dependencies = [ "futures", "half", "hashbrown 0.17.1", - "lz4_flex", + "lz4_flex 0.13.1", "num-bigint", "num-integer", "num-traits", @@ -6563,8 +6855,8 @@ version = "58.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "74c8db065291f088a2aad8ab831853eae1871c0d311c8d0b83bbc3b7e735d0fc" dependencies = [ - "arrow", - "arrow-schema", + "arrow 58.3.0", + "arrow-schema 58.3.0", "chrono", "half", "indexmap 2.14.0", @@ -6579,8 +6871,8 @@ version = "58.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a530e8d5b5e14efcb39c9a6ec55432ad11f6afb7dc4455a79be0dc615fe3cc31" dependencies = [ - "arrow", - "arrow-schema", + "arrow 58.3.0", + "arrow-schema 58.3.0", "chrono", "half", "indexmap 2.14.0", @@ -6596,7 +6888,7 @@ version = "58.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "00ed89908289f67caa2ca078f9ff9aacd6229a313ec92b12bf4f48f613dc2b97" dependencies = [ - "arrow-schema", + "arrow-schema 58.3.0", "base64", "chrono", "parquet-variant", @@ -7983,6 +8275,12 @@ version = "1.0.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d68f2ec51b097e4c1a75b681a8bec621909b5e91f15bb7b840c4f2f7b01148b2" +[[package]] +name = "scroll" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "04c565b551bafbef4157586fa379538366e4385d42082f255bfd96e4fe8519da" + [[package]] name = "seahash" version = "4.1.0" @@ -8404,6 +8702,30 @@ dependencies = [ "smallvec", ] +[[package]] +name = "spatialbench" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "07f3f4b67ccf571f183d3695aa6b9d6f996864c31782a480e97a23ed0f2f6f18" +dependencies = [ + "geo", + "once_cell", + "rand 0.8.6", + "serde", +] + +[[package]] +name = "spatialbench-arrow" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" 
+checksum = "ad89c32ed9e258bcc89713c296c7437963ce31f511eb8a408d2046e853294206" +dependencies = [ + "arrow 56.2.1", + "geo", + "geozero", + "spatialbench", +] + [[package]] name = "sqllogictest" version = "0.29.1" @@ -9141,7 +9463,7 @@ name = "tpchgen-arrow" version = "2.0.2" source = "git+https://github.com/clflushopt/tpchgen-rs.git?rev=438e9c2dbc25b2fff82c0efc08b3f13b5707874f#438e9c2dbc25b2fff82c0efc08b3f13b5707874f" dependencies = [ - "arrow", + "arrow 58.3.0", "tpchgen", ] @@ -9419,12 +9741,12 @@ name = "vortex" version = "0.1.0" dependencies = [ "anyhow", - "arrow-array", + "arrow-array 58.3.0", "codspeed-divan-compat", "fastlanes", "futures", "mimalloc", - "parquet", + "parquet 58.3.0", "paste", "rand 0.10.1", "rand_distr 0.6.0", @@ -9490,15 +9812,15 @@ dependencies = [ "arbitrary", "arc-swap", "arcref", - "arrow-arith", - "arrow-array", - "arrow-buffer", - "arrow-cast", - "arrow-data", - "arrow-ord", - "arrow-schema", - "arrow-select", - "arrow-string", + "arrow-arith 58.3.0", + "arrow-array 58.3.0", + "arrow-buffer 58.3.0", + "arrow-cast 58.3.0", + "arrow-data 58.3.0", + "arrow-ord 58.3.0", + "arrow-schema 58.3.0", + "arrow-select 58.3.0", + "arrow-string 58.3.0", "async-lock", "bytes", "cfg-if", @@ -9562,13 +9884,15 @@ name = "vortex-bench" version = "0.1.0" dependencies = [ "anyhow", - "arrow-array", - "arrow-schema", - "arrow-select", + "arrow-array 58.3.0", + "arrow-schema 58.3.0", + "arrow-select 58.3.0", "async-trait", "bzip2", "clap", "futures", + "geoarrow", + "geoarrow-cast", "get_dir", "glob", "humansize", @@ -9579,12 +9903,15 @@ dependencies = [ "noodles-bgzf", "noodles-vcf", "parking_lot", - "parquet", + "parquet 56.2.1", + "parquet 58.3.0", "rand 0.10.1", "regex", "reqwest 0.13.4", "serde", "serde_json", + "spatialbench", + "spatialbench-arrow", "sysinfo", "tabled", "target-lexicon", @@ -9599,6 +9926,7 @@ dependencies = [ "url", "uuid", "vortex", + "vortex-geo", "vortex-tensor", ] @@ -9637,7 +9965,7 @@ dependencies = [ name = "vortex-buffer" 
version = "0.1.0" dependencies = [ - "arrow-buffer", + "arrow-buffer 58.3.0", "bitvec", "bytes", "codspeed-divan-compat", @@ -9668,13 +9996,13 @@ dependencies = [ name = "vortex-compat" version = "0.1.0" dependencies = [ - "arrow-array", - "arrow-select", + "arrow-array 58.3.0", + "arrow-select 58.3.0", "base16ct", "bytes", "clap", "futures", - "parquet", + "parquet 58.3.0", "reqwest 0.13.4", "serde", "serde_json", @@ -9727,7 +10055,7 @@ name = "vortex-cuda" version = "0.1.0" dependencies = [ "arc-swap", - "arrow-schema", + "arrow-schema 58.3.0", "async-trait", "bindgen", "bytes", @@ -9758,7 +10086,7 @@ dependencies = [ name = "vortex-cuda-ffi" version = "0.1.0" dependencies = [ - "arrow-schema", + "arrow-schema 58.3.0", "futures", "vortex", "vortex-array", @@ -9780,8 +10108,8 @@ name = "vortex-cxx" version = "0.1.0" dependencies = [ "anyhow", - "arrow-array", - "arrow-schema", + "arrow-array 58.3.0", + "arrow-schema 58.3.0", "async-fs", "cxx", "futures", @@ -9795,8 +10123,8 @@ name = "vortex-datafusion" version = "0.1.0" dependencies = [ "anyhow", - "arrow-array", - "arrow-schema", + "arrow-array 58.3.0", + "arrow-schema 58.3.0", "async-trait", "datafusion 54.0.0", "datafusion-catalog 54.0.0", @@ -9897,7 +10225,7 @@ dependencies = [ name = "vortex-error" version = "0.1.0" dependencies = [ - "arrow-schema", + "arrow-schema 58.3.0", "flatbuffers", "jiff", "object_store 0.13.2", @@ -9932,8 +10260,8 @@ dependencies = [ name = "vortex-ffi" version = "0.1.0" dependencies = [ - "arrow-array", - "arrow-schema", + "arrow-array 58.3.0", + "arrow-schema 58.3.0", "async-fs", "cbindgen", "futures", @@ -10052,8 +10380,8 @@ dependencies = [ name = "vortex-geo" version = "0.1.0" dependencies = [ - "arrow-array", - "arrow-schema", + "arrow-array 58.3.0", + "arrow-schema 58.3.0", "geo-traits", "geo-types", "geoarrow", @@ -10118,8 +10446,8 @@ dependencies = [ name = "vortex-jni" version = "0.1.0" dependencies = [ - "arrow-array", - "arrow-schema", + "arrow-array 58.3.0", + 
"arrow-schema 58.3.0", "async-fs", "futures", "jni", @@ -10138,8 +10466,8 @@ dependencies = [ name = "vortex-json" version = "0.1.0" dependencies = [ - "arrow-array", - "arrow-schema", + "arrow-array 58.3.0", + "arrow-schema 58.3.0", "vortex-array", "vortex-error", "vortex-session", @@ -10150,8 +10478,8 @@ name = "vortex-layout" version = "0.1.0" dependencies = [ "arcref", - "arrow-array", - "arrow-schema", + "arrow-array 58.3.0", + "arrow-schema 58.3.0", "async-stream", "async-trait", "bit-vec", @@ -10241,9 +10569,9 @@ dependencies = [ name = "vortex-parquet-variant" version = "0.1.0" dependencies = [ - "arrow-array", - "arrow-buffer", - "arrow-schema", + "arrow-array 58.3.0", + "arrow-buffer 58.3.0", + "arrow-schema 58.3.0", "chrono", "parquet-variant", "parquet-variant-compute", @@ -10287,9 +10615,9 @@ dependencies = [ name = "vortex-python" version = "0.1.0" dependencies = [ - "arrow-array", - "arrow-data", - "arrow-schema", + "arrow-array 58.3.0", + "arrow-data 58.3.0", + "arrow-schema 58.3.0", "async-fs", "bytes", "itertools 0.14.0", @@ -10311,9 +10639,9 @@ dependencies = [ name = "vortex-row" version = "0.1.0" dependencies = [ - "arrow-array", - "arrow-row", - "arrow-schema", + "arrow-array 58.3.0", + "arrow-row 58.3.0", + "arrow-schema 58.3.0", "bytes", "codspeed-divan-compat", "mimalloc", @@ -10332,8 +10660,8 @@ name = "vortex-runend" version = "0.1.0" dependencies = [ "arbitrary", - "arrow-array", - "arrow-schema", + "arrow-array 58.3.0", + "arrow-schema 58.3.0", "codspeed-divan-compat", "itertools 0.14.0", "num-traits", @@ -10431,8 +10759,8 @@ dependencies = [ name = "vortex-tensor" version = "0.1.0" dependencies = [ - "arrow-array", - "arrow-schema", + "arrow-array 58.3.0", + "arrow-schema 58.3.0", "codspeed-divan-compat", "half", "itertools 0.14.0", @@ -10455,8 +10783,8 @@ dependencies = [ name = "vortex-test-e2e-cuda" version = "0.1.0" dependencies = [ - "arrow-array", - "arrow-schema", + "arrow-array 58.3.0", + "arrow-schema 58.3.0", "futures", 
"vortex", "vortex-cuda", @@ -10467,8 +10795,8 @@ name = "vortex-tui" version = "0.1.0" dependencies = [ "anyhow", - "arrow-array", - "arrow-schema", + "arrow-array 58.3.0", + "arrow-schema 58.3.0", "clap", "console_error_panic_hook", "crossterm", @@ -10481,7 +10809,7 @@ dependencies = [ "indicatif", "itertools 0.14.0", "js-sys", - "parquet", + "parquet 58.3.0", "ratatui", "ratzilla", "serde", @@ -10508,9 +10836,9 @@ dependencies = [ name = "vortex-web-wasm" version = "0.1.0" dependencies = [ - "arrow-array", - "arrow-ipc", - "arrow-schema", + "arrow-array 58.3.0", + "arrow-ipc 58.3.0", + "arrow-schema 58.3.0", "console_error_panic_hook", "futures", "js-sys", @@ -11185,6 +11513,18 @@ dependencies = [ "thiserror 1.0.69", ] +[[package]] +name = "wkt" +version = "0.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "54f7f1ff4ea4c18936d6cd26a6fd24f0003af37e951a8e0e8b9e9a2d0bd0a46d" +dependencies = [ + "geo-types", + "log", + "num-traits", + "thiserror 1.0.69", +] + [[package]] name = "wkt" version = "0.14.0" diff --git a/Cargo.toml b/Cargo.toml index 8c4e9e01b1e..7d4f81161fe 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -160,6 +160,7 @@ fuzzy-matcher = "0.3" geo-traits = "0.3.0" geo-types = "0.7.19" geoarrow = "0.8.0" +geoarrow-cast = "0.8.0" # Temporary fork bumped to DataFusion 54 until the upstream PR lands; pinned to an exact rev. geodatafusion = { git = "https://github.com/HarukiMoriarty/geodatafusion", rev = "3d50d7e549df720707133852848edd1ecff89265" } get_dir = "0.5.0" @@ -241,6 +242,14 @@ similar = "3.0.0" sketches-ddsketch = "0.4.0" smallvec = "1.15.1" smol = "2.0.2" +spatialbench = "0.2" +spatialbench-arrow = "0.2" +# spatialbench still pins arrow 56, two majors behind the workspace arrow. Until upstream +# catches up, write its generated batches with a matching parquet instead of converting +# arrow versions at the boundary. 
+spatialbench-parquet = { package = "parquet", version = "56", features = [ + "async", +] } static_assertions = "1.1" strum = "0.28" syn = { version = "2.0.117", features = ["full"] } diff --git a/benchmarks/datafusion-bench/Cargo.toml b/benchmarks/datafusion-bench/Cargo.toml index d0015dd2995..e2d387fe89f 100644 --- a/benchmarks/datafusion-bench/Cargo.toml +++ b/benchmarks/datafusion-bench/Cargo.toml @@ -27,6 +27,7 @@ datafusion = { workspace = true, features = [ datafusion-common = { workspace = true } datafusion-physical-plan = { workspace = true } futures.workspace = true +geodatafusion = { workspace = true } itertools.workspace = true object_store = { workspace = true, features = ["aws", "gcp"] } opentelemetry.workspace = true @@ -39,6 +40,7 @@ vortex = { workspace = true, features = ["object_store", "files", "tokio"] } vortex-bench = { workspace = true } vortex-cuda = { workspace = true, optional = true } vortex-datafusion = { workspace = true } +vortex-geo = { workspace = true } vortex-metrics = { workspace = true } [build-dependencies] diff --git a/benchmarks/datafusion-bench/src/lib.rs b/benchmarks/datafusion-bench/src/lib.rs index c8353eb1f85..8f4fe3bbe85 100644 --- a/benchmarks/datafusion-bench/src/lib.rs +++ b/benchmarks/datafusion-bench/src/lib.rs @@ -24,6 +24,7 @@ use object_store::aws::AmazonS3Builder; use object_store::gcp::GoogleCloudStorageBuilder; use object_store::local::LocalFileSystem; use url::Url; +use vortex_bench::BenchmarkArg; use vortex_bench::Format; use vortex_bench::SESSION; use vortex_datafusion::VortexFormat; @@ -31,7 +32,7 @@ use vortex_datafusion::VortexFormatFactory; use vortex_datafusion::VortexTableOptions; #[expect(clippy::expect_used)] -pub fn get_session_context() -> SessionContext { +pub fn get_session_context(benchmark: BenchmarkArg) -> SessionContext { let mut rt_builder = RuntimeEnvBuilder::new(); let file_static_cache = Arc::new(DefaultFileStatisticsCache::default()); @@ -45,13 +46,27 @@ pub fn get_session_context() -> 
SessionContext { .build_arc() .expect("could not build runtime environment"); - let factory = VortexFormatFactory::new().with_options(VortexTableOptions { - projection_pushdown: true, - ..Default::default() - }); + let factory = VortexFormatFactory::new_with_options( + SESSION.clone(), + VortexTableOptions { + projection_pushdown: true, + ..Default::default() + }, + ); + + let mut config = SessionConfig::from_env().expect("shouldn't fail"); + // SpatialBench reads geoarrow.point Parquet and benchmarks an ST_* predicate, so it needs + // Parquet-specific tuning. + if matches!(benchmark, BenchmarkArg::SpatialBench) { + // Keep Parquet field metadata so the geoarrow.point extension survives the read. + config.options_mut().execution.parquet.skip_metadata = false; + // Evaluate (and reorder) the filter inside the parquet scan -- fairest parquet baseline. + config.options_mut().execution.parquet.pushdown_filters = true; + config.options_mut().execution.parquet.reorder_filters = true; + } let mut session_state_builder = SessionStateBuilder::new() - .with_config(SessionConfig::from_env().expect("shouldn't fail")) + .with_config(config) .with_runtime_env(rt) .with_default_features(); @@ -66,7 +81,10 @@ pub fn get_session_context() -> SessionContext { file_formats.push(Arc::new(factory)); } - SessionContext::new_with_state(session_state_builder.build()) + let ctx = SessionContext::new_with_state(session_state_builder.build()); + // Register geodatafusion's PostGIS-style ST_* UDFs so SpatialBench SQL plans. 
+ geodatafusion::register(&ctx); + ctx } pub fn make_object_store( @@ -109,11 +127,16 @@ pub fn make_object_store( } } -pub fn format_to_df_format(format: Format) -> Arc { +pub fn format_to_df_format(format: Format, benchmark: BenchmarkArg) -> Arc { match format { Format::Csv => Arc::new(CsvFormat::default()) as _, Format::Arrow => Arc::new(ArrowFormat), - Format::Parquet => Arc::new(ParquetFormat::new()), + Format::Parquet => { + // SpatialBench needs Parquet field metadata to rebuild the geoarrow.point extension + // during schema inference; other benchmarks keep the DataFusion default. + let skip_metadata = !matches!(benchmark, BenchmarkArg::SpatialBench); + Arc::new(ParquetFormat::new().with_skip_metadata(skip_metadata)) + } Format::OnDiskVortex | Format::VortexCompact => { Arc::new(VortexFormat::new(SESSION.clone())) } diff --git a/benchmarks/datafusion-bench/src/main.rs b/benchmarks/datafusion-bench/src/main.rs index b8f9ac42df6..1b890651d02 100644 --- a/benchmarks/datafusion-bench/src/main.rs +++ b/benchmarks/datafusion-bench/src/main.rs @@ -27,6 +27,7 @@ use datafusion_physical_plan::collect; use futures::StreamExt; use parking_lot::Mutex; use tokio::fs::File; +use vortex::array::arrow::ArrowSessionExt; use vortex::io::filesystem::FileSystemRef; use vortex::scan::DataSourceRef; use vortex_bench::Benchmark; @@ -188,9 +189,9 @@ async fn main() -> anyhow::Result<()> { |format| { let benchmark = &*benchmark; async move { - let session = datafusion_bench::get_session_context(); + let session = datafusion_bench::get_session_context(args.benchmark); datafusion_bench::make_object_store(&session, benchmark.data_url())?; - register_benchmark_tables(&session, benchmark, format).await?; + register_benchmark_tables(&session, benchmark, format, args.benchmark).await?; Ok((session, format)) } }, @@ -254,6 +255,7 @@ async fn register_benchmark_tables( session: &SessionContext, benchmark: &B, format: Format, + benchmark_arg: BenchmarkArg, ) -> anyhow::Result<()> { match 
format { Format::Arrow => register_arrow_tables(session, benchmark).await, @@ -261,8 +263,8 @@ async fn register_benchmark_tables( register_v2_tables(session, benchmark, format).await } _ => { - let benchmark_base = benchmark.data_url().join(&format!("{}/", format.name()))?; - let file_format = format_to_df_format(format); + let benchmark_base = benchmark.format_path(format, benchmark.data_url())?; + let file_format = format_to_df_format(format, benchmark_arg); for table in benchmark.table_specs().iter() { let pattern = benchmark.pattern(table.name, format); @@ -307,7 +309,7 @@ async fn register_v2_tables( use vortex::scan::DataSource as _; use vortex_datafusion::v2::VortexTable; - let benchmark_base = benchmark.data_url().join(&format!("{}/", format.name()))?; + let benchmark_base = benchmark.format_path(format, benchmark.data_url())?; for table in benchmark.table_specs().iter() { let pattern = benchmark.pattern(table.name, format); @@ -334,7 +336,7 @@ async fn register_v2_tables( .build() .await?; - let arrow_schema = Arc::new(multi_ds.dtype().to_arrow_schema()?); + let arrow_schema = Arc::new(SESSION.arrow().to_arrow_schema(multi_ds.dtype())?); let data_source: DataSourceRef = Arc::new(multi_ds); let table_provider = Arc::new(VortexTable::new(data_source, SESSION.clone(), arrow_schema)); diff --git a/vortex-bench/Cargo.toml b/vortex-bench/Cargo.toml index 3b793c6124a..0187bdb986e 100644 --- a/vortex-bench/Cargo.toml +++ b/vortex-bench/Cargo.toml @@ -23,6 +23,7 @@ vortex = { workspace = true, features = [ "tokio", "zstd", ] } +vortex-geo = { workspace = true } vortex-tensor = { workspace = true } # TODO(connor): In the future, this might be inside vortex. 
anyhow = { workspace = true } @@ -33,6 +34,8 @@ async-trait = { workspace = true } bzip2 = { workspace = true } clap = { workspace = true, features = ["derive"] } futures = { workspace = true } +geoarrow = { workspace = true } +geoarrow-cast = { workspace = true } get_dir = { workspace = true } glob = { workspace = true } humansize = { workspace = true } @@ -48,6 +51,9 @@ regex = { workspace = true } reqwest = { workspace = true, features = ["stream"] } serde = { workspace = true, features = ["derive"] } serde_json = { workspace = true } +spatialbench = { workspace = true } +spatialbench-arrow = { workspace = true } +spatialbench-parquet = { workspace = true } sysinfo = { workspace = true } tabled = { workspace = true, features = ["std"] } target-lexicon = { workspace = true } diff --git a/vortex-bench/spatialbench.sql b/vortex-bench/spatialbench.sql new file mode 100644 index 00000000000..a55ba851cd6 --- /dev/null +++ b/vortex-bench/spatialbench.sql @@ -0,0 +1,228 @@ +-- SpatialBench queries (Apache Sedona), WKB dialect. See sedona-spatialbench/docs/queries.md. +-- Numbered from Q0 (= SpatialBench Q1). Only Q0 is wired up today, the rest are not run yet. + +-- Q0: Find trips starting within 50km of the Sedona city center, ranked by distance. +SELECT + t_tripkey, + ST_X(ST_GeomFromWKB(t_pickuploc)) AS pickup_lon, + ST_Y(ST_GeomFromWKB(t_pickuploc)) AS pickup_lat, + t_pickuptime, + ST_Distance(ST_GeomFromWKB(t_pickuploc), ST_Point(-111.7610::double, 34.8697::double)) AS distance_to_center +FROM trip +WHERE ST_Distance(ST_GeomFromWKB(t_pickuploc), ST_Point(-111.7610::double, 34.8697::double)) <= 0.45::double +ORDER BY distance_to_center ASC, t_tripkey ASC; + +-- Q1: Count trips starting within Coconino County (Arizona) zone. 
+SELECT COUNT(*) AS trip_count_in_coconino_county +FROM trip t +WHERE ST_Intersects( + ST_GeomFromWKB(t.t_pickuploc), + ( + SELECT ST_GeomFromWKB(z.z_boundary) + FROM zone z + WHERE z.z_name = 'Coconino County' + LIMIT 1 + ) +); + +-- Q2: Monthly trip statistics within a 15km radius of the Sedona city center. +SELECT + DATE_TRUNC('month', t.t_pickuptime) AS pickup_month, + COUNT(t.t_tripkey) AS total_trips, + AVG(t.t_distance) AS avg_distance, + AVG(t.t_dropofftime - t.t_pickuptime) AS avg_duration, + AVG(t.t_fare) AS avg_fare +FROM trip t +WHERE ST_DWithin( + ST_GeomFromWKB(t.t_pickuploc), + ST_GeomFromText('POLYGON(( + -111.9060 34.7347, -111.6160 34.7347, + -111.6160 35.0047, -111.9060 35.0047, + -111.9060 34.7347 + ))'), -- Bounding box around Sedona + 0.045 -- Additional 5km buffer in degrees +) +GROUP BY pickup_month +ORDER BY pickup_month; + +-- Q3: Zone distribution of top 1000 trips by tip amount. +SELECT + z.z_zonekey, + z.z_name, + COUNT(*) AS trip_count +FROM + zone z + JOIN ( + SELECT t.t_pickuploc + FROM trip t + ORDER BY t.t_tip DESC, t.t_tripkey ASC + LIMIT 1000 + ) top_trips + ON ST_Within( + ST_GeomFromWKB(top_trips.t_pickuploc), + ST_GeomFromWKB(z.z_boundary) + ) +GROUP BY z.z_zonekey, z.z_name +ORDER BY trip_count DESC, z.z_zonekey ASC; + +-- Q4: Monthly travel patterns for repeat customers (convex hull of dropoff locations). +SELECT + c.c_custkey, + c.c_name AS customer_name, + DATE_TRUNC('month', t.t_pickuptime) AS pickup_month, + ST_Area( + ST_ConvexHull(ST_Collect(ST_GeomFromWKB(t.t_dropoffloc))) + ) AS monthly_travel_hull_area, + COUNT(*) as dropoff_count +FROM trip t +JOIN customer c + ON t.t_custkey = c.c_custkey +GROUP BY c.c_custkey, c.c_name, pickup_month +HAVING dropoff_count > 5 -- Only include repeat customers +ORDER BY monthly_travel_hull_area DESC, c.c_custkey ASC; + +-- Q5: Zone statistics for trips within a 50km radius of the Sedona city center. 
+SELECT + z.z_zonekey, + z.z_name, + COUNT(t.t_tripkey) AS total_pickups, + AVG(t.t_distance) AS avg_distance, + AVG(t.t_dropofftime - t.t_pickuptime) AS avg_duration +FROM trip t, zone z +WHERE ST_Intersects( + ST_GeomFromText('POLYGON(( + -112.2110 34.4197, -111.3110 34.4197, + -111.3110 35.3197, -112.2110 35.3197, + -112.2110 34.4197 + ))'), -- Bounding box around Sedona + ST_GeomFromWKB(z.z_boundary) + ) + AND ST_Within( + ST_GeomFromWKB(t.t_pickuploc), + ST_GeomFromWKB(z.z_boundary) + ) +GROUP BY z.z_zonekey, z.z_name +ORDER BY total_pickups DESC, z.z_zonekey ASC; + +-- Q6: Detect potential route detours by comparing reported vs. geometric distances. +WITH trip_lengths AS ( + SELECT + t.t_tripkey, + t.t_distance AS reported_distance_m, + ST_Length( + ST_MakeLine( + ST_GeomFromWKB(t.t_pickuploc), + ST_GeomFromWKB(t.t_dropoffloc) + ) + ) * 111111 AS line_distance_m -- Approx. meters per degree + FROM trip t +) +SELECT + t.t_tripkey, + t.reported_distance_m, + t.line_distance_m, + t.reported_distance_m / NULLIF(t.line_distance_m, 0) AS detour_ratio +FROM trip_lengths t +ORDER BY + detour_ratio DESC NULLS LAST, + reported_distance_m DESC, + t_tripkey ASC; + +-- Q7: Count nearby pickups for each building within a 500m radius. +SELECT b.b_buildingkey, b.b_name, COUNT(*) AS nearby_pickup_count +FROM trip t +JOIN building b +ON ST_DWithin(ST_GeomFromWKB(t.t_pickuploc), ST_GeomFromWKB(b.b_boundary), 0.0045) -- ~500m +GROUP BY b.b_buildingkey, b.b_name +ORDER BY nearby_pickup_count DESC, b.b_buildingkey ASC; + +-- Q8: Building conflation (duplicate/overlap detection via IoU). 
+WITH b1 AS ( + SELECT b_buildingkey AS id, ST_GeomFromWKB(b_boundary) AS geom + FROM building +), +b2 AS ( + SELECT b_buildingkey AS id, ST_GeomFromWKB(b_boundary) AS geom + FROM building +), +pairs AS ( + SELECT + b1.id AS building_1, + b2.id AS building_2, + ST_Area(b1.geom) AS area1, + ST_Area(b2.geom) AS area2, + ST_Area(ST_Intersection(b1.geom, b2.geom)) AS overlap_area + FROM b1 + JOIN b2 ON b1.id < b2.id AND ST_Intersects(b1.geom, b2.geom) +) +SELECT + building_1, + building_2, + area1, + area2, + overlap_area, + CASE + WHEN (area1 + area2 - overlap_area) = 0 THEN 1.0 + ELSE overlap_area / (area1 + area2 - overlap_area) + END AS iou +FROM pairs +ORDER BY iou DESC, building_1 ASC, building_2 ASC; + +-- Q9: Zone statistics for trips starting within each zone. +SELECT + z.z_zonekey, + z.z_name AS pickup_zone, + AVG(t.t_dropofftime - t.t_pickuptime) AS avg_duration, + AVG(t.t_distance) AS avg_distance, + COUNT(t.t_tripkey) AS num_trips +FROM + zone z + LEFT JOIN trip t + ON ST_Within( + ST_GeomFromWKB(t.t_pickuploc), ST_GeomFromWKB(z.z_boundary) + ) +GROUP BY z.z_zonekey, z.z_name +ORDER BY avg_duration DESC NULLS LAST, z.z_zonekey ASC; + +-- Q10: Count trips that cross between different zones. +SELECT COUNT(*) AS cross_zone_trip_count +FROM + trip t + JOIN zone pickup_zone + ON ST_Within( + ST_GeomFromWKB(t.t_pickuploc), + ST_GeomFromWKB(pickup_zone.z_boundary) + ) + JOIN zone dropoff_zone + ON ST_Within( + ST_GeomFromWKB(t.t_dropoffloc), + ST_GeomFromWKB(dropoff_zone.z_boundary) + ) +WHERE pickup_zone.z_zonekey != dropoff_zone.z_zonekey; + +-- Q11: Find five nearest buildings to each trip pickup location using KNN join. 
+WITH trip_with_geom AS ( + SELECT + t_tripkey, + t_pickuploc, + ST_GeomFromWKB(t_pickuploc) as pickup_geom + FROM trip +), +building_with_geom AS ( + SELECT + b_buildingkey, + b_name, + b_boundary, + ST_GeomFromWKB(b_boundary) as boundary_geom + FROM building +) +SELECT + t.t_tripkey, + t.t_pickuploc, + b.b_buildingkey, + b.b_name AS building_name, + ST_Distance(t.pickup_geom, b.boundary_geom) AS distance_to_building +FROM trip_with_geom t +JOIN building_with_geom b + ON ST_KNN(t.pickup_geom, b.boundary_geom, 5, FALSE) +ORDER BY t.t_tripkey ASC, distance_to_building ASC, b.b_buildingkey ASC; diff --git a/vortex-bench/src/datasets/mod.rs b/vortex-bench/src/datasets/mod.rs index 3e72ba69e7f..378a5b7b1c1 100644 --- a/vortex-bench/src/datasets/mod.rs +++ b/vortex-bench/src/datasets/mod.rs @@ -69,6 +69,11 @@ pub enum BenchmarkDataset { ClickBench { flavor: Flavor }, #[serde(rename = "public-bi")] PublicBi { name: String }, + #[serde(rename = "spatialbench")] + SpatialBench { + scale_factor: String, + native_points: bool, + }, #[serde(rename = "statpopgen")] StatPopGen { n_rows: u64 }, #[serde(rename = "polarsignals")] @@ -87,6 +92,7 @@ impl BenchmarkDataset { BenchmarkDataset::TpcDS { .. } => "tpcds", BenchmarkDataset::ClickBench { .. } => "clickbench", BenchmarkDataset::PublicBi { .. } => "public-bi", + BenchmarkDataset::SpatialBench { .. } => "spatialbench", BenchmarkDataset::StatPopGen { .. } => "statpopgen", BenchmarkDataset::PolarSignals { .. 
} => "polarsignals", BenchmarkDataset::Fineweb => "fineweb", @@ -106,6 +112,17 @@ impl Display for BenchmarkDataset { Flavor::Single => write!(f, "clickbench-single"), }, BenchmarkDataset::PublicBi { name } => write!(f, "public-bi({name})"), + BenchmarkDataset::SpatialBench { + scale_factor, + native_points, + } => { + let points = if *native_points { + ", points=native" + } else { + "" + }; + write!(f, "spatialbench(sf={scale_factor}{points})") + } BenchmarkDataset::StatPopGen { n_rows } => write!(f, "statpopgen(n_rows={n_rows})"), BenchmarkDataset::PolarSignals { n_rows } => { write!(f, "polarsignals(n_rows={n_rows})") @@ -163,6 +180,7 @@ impl BenchmarkDataset { "supplier", ], BenchmarkDataset::ClickBench { .. } | BenchmarkDataset::PublicBi { .. } => todo!(), + BenchmarkDataset::SpatialBench { .. } => &["trip"], BenchmarkDataset::StatPopGen { .. } => &["statpopgen"], BenchmarkDataset::PolarSignals { .. } => &["stacktraces"], BenchmarkDataset::Fineweb => &["fineweb"], diff --git a/vortex-bench/src/lib.rs b/vortex-bench/src/lib.rs index 30ff45c97a8..b131906d85f 100644 --- a/vortex-bench/src/lib.rs +++ b/vortex-bench/src/lib.rs @@ -34,6 +34,8 @@ use vortex::file::VortexWriteOptions; use vortex::file::WriteStrategyBuilder; use vortex::utils::aliases::hash_map::HashMap; +use crate::spatialbench::SpatialBenchBenchmark; + pub mod appian; pub mod benchmark; pub mod clickbench; @@ -51,6 +53,7 @@ pub mod public_bi; pub mod random_access; pub mod realnest; pub mod runner; +pub mod spatialbench; pub mod statpopgen; pub mod tpcds; pub mod tpch; @@ -72,8 +75,11 @@ use vortex::session::VortexSession; #[global_allocator] static GLOBAL: mimalloc::MiMalloc = mimalloc::MiMalloc; -pub static SESSION: LazyLock = - LazyLock::new(|| VortexSession::default().with_tokio()); +pub static SESSION: LazyLock = LazyLock::new(|| { + let session = VortexSession::default().with_tokio(); + vortex_geo::initialize(&session); + session +}); #[derive(Clone, Copy, Debug, Hash, PartialEq, Eq, Serialize, 
Deserialize)] pub struct Target { @@ -265,6 +271,8 @@ pub enum BenchmarkArg { PolarSignals, #[clap(name = "public-bi")] PublicBi, + #[clap(name = "spatialbench")] + SpatialBench, } /// Default scale factor for TPC-related benchmarks @@ -326,6 +334,21 @@ pub fn create_benchmark(b: BenchmarkArg, opts: &Opts) -> anyhow::Result { + let scale_factor = opts.get(SCALE_FACTOR_KEY).unwrap_or(DEFAULT_SCALE_FACTOR); + let remote_data_dir = opts.get_as::(REMOTE_DATA_KEY); + let native_points = match opts.get("points") { + None | Some("wkb") => false, + Some("native") => true, + Some(other) => bail!("unknown points option {other:?}, expected wkb or native"), + }; + let benchmark = SpatialBenchBenchmark::new( + scale_factor.to_string(), + remote_data_dir, + native_points, + )?; + Ok(Box::new(benchmark) as _) + } } } diff --git a/vortex-bench/src/spatialbench/benchmark.rs b/vortex-bench/src/spatialbench/benchmark.rs new file mode 100644 index 00000000000..e0c4fa97e3c --- /dev/null +++ b/vortex-bench/src/spatialbench/benchmark.rs @@ -0,0 +1,192 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! SpatialBench benchmark implementation + +use std::fs; + +use url::Url; + +use crate::Benchmark; +use crate::BenchmarkDataset; +use crate::Format; +use crate::TableSpec; +use crate::spatialbench::datagen; +use crate::spatialbench::datagen::Table; +use crate::utils::file::resolve_data_url; +use crate::workspace_root; + +/// Directory under the benchmark data dir holding the native-Point vortex files. +pub const NATIVE_DIR: &str = "vortex-native"; + +/// Directory under the benchmark data dir holding the native-Point GeoParquet files. +pub const PARQUET_NATIVE_DIR: &str = "parquet-native"; + +/// Temporary allowlist of the queries wired up to run (0-based, in `spatialbench.sql` order). The +/// file holds the full SpatialBench suite, but the rest need dimension tables and geo functions +/// that are not registered yet. 
Remove this once the whole suite is supported and run them all. +const SUPPORTED_QUERIES: &[usize] = &[0]; + +/// SpatialBench geospatial analytics benchmark (Apache Sedona). +/// +/// A ride-sharing workload: a `trip` fact table of WKB point locations plus polygon dimension +/// tables, with spatial-predicate, KNN, and join queries. See +/// . +/// +/// Only Q0 and the `trip` table it reads are wired up so far; dimension tables come with later +/// queries. +pub struct SpatialBenchBenchmark { + pub scale_factor: String, + pub data_url: Url, + /// Store geometry as the native Point extension instead of WKB (`--opt points=native`): the + /// vortex/parquet formats read the native-Point files and queries skip `ST_GeomFromWKB`. + pub native_points: bool, +} + +impl SpatialBenchBenchmark { + pub fn new( + scale_factor: String, + use_remote_data_dir: Option, + native_points: bool, + ) -> anyhow::Result { + Ok(Self { + data_url: resolve_data_url( + use_remote_data_dir.as_deref(), + &format!("spatialbench/{scale_factor}"), + )?, + scale_factor, + native_points, + }) + } +} + +#[async_trait::async_trait] +impl Benchmark for SpatialBenchBenchmark { + fn queries(&self) -> anyhow::Result> { + // Queries are in the WKB (canonical SpatialBench) dialect; for `points=native` the + // `ST_GeomFromWKB(..)` wrappers are stripped. Statements are `;`-separated and numbered + // 0-based in file order; only `SUPPORTED_QUERIES` are returned to the runner. 
+ let queries_file = workspace_root() + .join("vortex-bench") + .join("spatialbench") + .with_extension("sql"); + let contents = fs::read_to_string(queries_file)?; + let contents = if self.native_points { + strip_wkb_wrappers(&contents) + } else { + contents + }; + Ok(contents + .split_terminator(';') + .map(str::trim) + .map(str::to_string) + .enumerate() + .filter(|(idx, _)| SUPPORTED_QUERIES.contains(idx)) + .collect()) + } + + async fn generate_base_data(&self) -> anyhow::Result<()> { + if self.data_url.scheme() != "file" { + return Ok(()); + } + + let base_data_dir = self + .data_url + .to_file_path() + .map_err(|_| anyhow::anyhow!("Invalid file URL: {}", self.data_url.as_str()))?; + + datagen::generate_tables(&self.scale_factor, base_data_dir.clone()).await?; + + if self.native_points { + let parquet_dir = base_data_dir.join(Format::Parquet.name()); + datagen::write_native_vortex( + Table::Trip, + &parquet_dir, + &base_data_dir.join(NATIVE_DIR), + ) + .await?; + datagen::write_native_parquet( + Table::Trip, + &parquet_dir, + &base_data_dir.join(PARQUET_NATIVE_DIR), + ) + .await?; + } + Ok(()) + } + + fn format_path(&self, format: Format, base_url: &Url) -> anyhow::Result { + if self.native_points { + // points=native serves the pre-converted native-Point dirs: vortex (Point extension, + // GeoDistance pushdown) and parquet (GeoParquet geodatafusion reads as geometry). + // Other formats would feed WKB to native-variant SQL, so fail fast. + let dir = match format { + Format::OnDiskVortex => NATIVE_DIR, + Format::Parquet => PARQUET_NATIVE_DIR, + other => anyhow::bail!( + "points=native only supports the vortex and parquet formats, got {other}" + ), + }; + return Ok(base_url.join(&format!("{dir}/"))?); + } + Ok(base_url.join(&format!("{}/", format.name()))?) + } + + fn expected_row_counts(&self) -> Option> { + // 0-based by query index: Q0's count sits at index 0; counts cross-checked against an + // independent brute-force WKB decode. 
/// Rewrite a WKB-dialect query for the native-Point encoding by replacing each
/// `ST_GeomFromWKB(<expr>)` call with `<expr>` -- the native columns are already geometries.
///
/// The wrapper is closed at its *matching* parenthesis, so arguments that themselves contain
/// parentheses (`ST_GeomFromWKB(f(a) || g(b))`) are unwrapped intact, and wrappers nested inside
/// the argument are stripped as well. A wrapper with no matching `)` is emitted verbatim.
///
/// This is a textual rewrite, not a SQL parser: the match is case-sensitive and parentheses
/// inside string literals or comments are counted like any other.
fn strip_wkb_wrappers(sql: &str) -> String {
    const OPEN: &str = "ST_GeomFromWKB(";
    let mut out = String::with_capacity(sql.len());
    let mut rest = sql;
    while let Some(pos) = rest.find(OPEN) {
        out.push_str(&rest[..pos]);
        let after = &rest[pos + OPEN.len()..];
        match matching_close_paren(after) {
            Some(close) => {
                // The argument may itself contain wrapped columns; rewrite it recursively.
                out.push_str(&strip_wkb_wrappers(&after[..close]));
                rest = &after[close + 1..];
            }
            // Unbalanced wrapper: keep the opener as written and carry on with what follows.
            None => {
                out.push_str(OPEN);
                rest = after;
            }
        }
    }
    out.push_str(rest);
    out
}

/// Byte offset of the `)` that closes a parenthesis opened immediately before `s`, skipping over
/// any balanced `( .. )` groups inside. Returns `None` if `s` ends before that `)` is reached.
fn matching_close_paren(s: &str) -> Option<usize> {
    let mut depth = 0usize;
    // `(` and `)` are ASCII, so a byte offset found here is always a valid `str` boundary.
    for (idx, byte) in s.bytes().enumerate() {
        match byte {
            b'(' => depth += 1,
            b')' if depth == 0 => return Some(idx),
            b')' => depth -= 1,
            _ => {}
        }
    }
    None
}
+ +pub mod native; +pub mod table; +pub mod wkb; + +pub use native::write_native_parquet; +pub use native::write_native_vortex; +pub use table::Table; +pub use wkb::generate_tables; diff --git a/vortex-bench/src/spatialbench/datagen/native.rs b/vortex-bench/src/spatialbench/datagen/native.rs new file mode 100644 index 00000000000..d9fbbb8f005 --- /dev/null +++ b/vortex-bench/src/spatialbench/datagen/native.rs @@ -0,0 +1,175 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Native-geometry preparation for `points=native`: decode each table's WKB geometry columns to +//! native GeoArrow types in Arrow land (`geoarrow_cast`, so Vortex never decodes WKB), then write +//! them as a native Vortex file and a GeoParquet file. The decode is a one-time data-prep cost. + +use std::path::Path; +use std::path::PathBuf; +use std::sync::Arc; + +use anyhow::Context; +use arrow_array::RecordBatch; +use arrow_schema::Schema; +use futures::TryStreamExt; +use geoarrow::array::GenericWkbArray; +use geoarrow::array::GeoArrowArray; +use geoarrow::datatypes::CoordType; +use geoarrow::datatypes::Crs; +use geoarrow::datatypes::Dimension; +use geoarrow::datatypes::GeoArrowType; +use geoarrow::datatypes::Metadata; +use geoarrow::datatypes::PointType; +use geoarrow::datatypes::WkbType; +use geoarrow_cast::cast::cast; +use parquet::arrow::AsyncArrowWriter; +use parquet::arrow::ParquetRecordBatchStreamBuilder; +use parquet::arrow::ProjectionMask; +use parquet::basic::Compression; +use parquet::file::properties::WriterProperties; +use tokio::fs::File as TokioFile; +use vortex::array::ArrayRef; +use vortex::array::IntoArray; +use vortex::array::arrays::ChunkedArray; +use vortex::array::arrow::ArrowSessionExt; +use vortex::file::WriteOptionsSessionExt; + +use super::table::GeometryKind; +use super::table::Table; +use crate::SESSION; +use crate::utils::file::idempotent_async; + +/// EPSG:4326, the CRS the benchmark data and queries 
assume. +fn epsg_4326() -> Arc { + Arc::new(Metadata::new( + Crs::from_unknown_crs_type("EPSG:4326".to_string()), + None, + )) +} + +/// The native GeoArrow type for `kind`, separated-XY in EPSG:4326. +fn geoarrow_type(kind: GeometryKind) -> GeoArrowType { + match kind { + GeometryKind::Point => GeoArrowType::Point( + PointType::new(Dimension::XY, epsg_4326()).with_coord_type(CoordType::Separated), + ), + } +} + +/// Write `{native_dir}/{table}_0.vortex` with native geometry columns from the WKB parquet. Idempotent. +pub async fn write_native_vortex( + table: Table, + parquet_dir: &Path, + native_dir: &Path, +) -> anyhow::Result { + idempotent_async( + native_dir.join(format!("{}_0.vortex", table.name())), + |path| async move { + let chunks = map_source_batches(parquet_dir, table, |b| native_chunk(b, table)).await?; + + let dtype = chunks[0].dtype().clone(); + let chunked = ChunkedArray::try_new(chunks, dtype)?.into_array(); + let mut file = TokioFile::create(&path).await?; + SESSION + .write_options() + .write(&mut file, chunked.to_array_stream()) + .await?; + tracing::info!(path = %path.display(), table = table.name(), "wrote native geometry table"); + Ok(()) + }, + ) + .await +} + +/// Write `{out_dir}/{table}_0.parquet` with native GeoArrow geometry columns (separated XY, +/// `geoarrow.*` field metadata so geodatafusion reads them as geometries). Idempotent. 
+pub async fn write_native_parquet( + table: Table, + parquet_dir: &Path, + out_dir: &Path, +) -> anyhow::Result { + idempotent_async( + out_dir.join(format!("{}_0.parquet", table.name())), + |path| async move { + let batches = + map_source_batches(parquet_dir, table, |b| native_record_batch(b, table)).await?; + + let schema = batches.first().context("no batches to write")?.schema(); + let props = WriterProperties::builder() + .set_compression(Compression::SNAPPY) + .build(); + let mut writer = + AsyncArrowWriter::try_new(TokioFile::create(&path).await?, schema, Some(props))?; + for batch in &batches { + writer.write(batch).await?; + } + writer.close().await?; + tracing::info!(path = %path.display(), table = table.name(), "wrote native geometry parquet table"); + Ok(()) + }, + ) + .await +} + +/// Apply `f` to every batch read from `table`'s base WKB parquet parts, projected to its columns. +async fn map_source_batches( + parquet_dir: &Path, + table: Table, + mut f: impl FnMut(RecordBatch) -> anyhow::Result, +) -> anyhow::Result> { + let pattern = parquet_dir.join(format!("{}_*.parquet", table.name())); + let mut files: Vec = + glob::glob(&pattern.to_string_lossy())?.collect::>()?; + files.sort(); + anyhow::ensure!(!files.is_empty(), "no parquet matching {pattern:?}"); + + let mut out = Vec::new(); + for file in files { + let builder = ParquetRecordBatchStreamBuilder::new(TokioFile::open(&file).await?).await?; + let mask = + ProjectionMask::columns(builder.parquet_schema(), table.columns().iter().copied()); + let mut stream = builder.with_projection(mask).build()?; + while let Some(batch) = stream.try_next().await? { + out.push(f(batch)?); + } + } + Ok(out) +} + +/// Decode each of `table`'s geometry columns from WKB to its native GeoArrow type, swapping the +/// column in so the field carries the matching `geoarrow.*` extension metadata. 
/// A SpatialBench base table.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum Table {
    Trip,
}

/// Every base table. WKB generation emits all of them; native conversion handles those with
/// geometry columns.
pub(crate) const TABLES: &[Table] = &[Table::Trip];

/// A geometry column and the geometry type its WKB bytes decode to.
#[derive(Clone, Copy, Debug)]
pub(crate) struct GeometryColumn {
    /// Column name as it appears in the table's parquet schema.
    pub(crate) name: &'static str,
    /// Geometry type the column's WKB values decode to.
    pub(crate) kind: GeometryKind,
}

/// Geometry types a column can hold. Add a variant (and the matching arm in [`super::native`]) as
/// tables with new geometry types are wired.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub(crate) enum GeometryKind {
    Point,
}

impl Table {
    /// File stem under a format directory, e.g. `Trip` → `trip_{part}.parquet`.
    pub(crate) fn name(self) -> &'static str {
        match self {
            Table::Trip => "trip",
        }
    }

    /// Columns the wired queries read — the projection applied when building native files.
    pub(crate) fn columns(self) -> &'static [&'static str] {
        match self {
            Table::Trip => &["t_tripkey", "t_pickuptime", "t_pickuploc"],
        }
    }

    /// Geometry columns to decode from WKB to native, with their geometry type.
    pub(crate) fn geometry_columns(self) -> &'static [GeometryColumn] {
        match self {
            Table::Trip => &[GeometryColumn {
                name: "t_pickuploc",
                kind: GeometryKind::Point,
            }],
        }
    }
}
+use spatialbench::generators::TripGenerator; +use spatialbench_arrow::RecordBatchIterator; +use spatialbench_arrow::TripArrow; +use spatialbench_parquet::arrow::AsyncArrowWriter; +use spatialbench_parquet::basic::Compression; +use spatialbench_parquet::file::properties::WriterProperties; +use tokio::fs::File as TokioFile; +use tracing::info; + +use super::table::TABLES; +use super::table::Table; +use crate::Format; +use crate::utils::file::idempotent_async; + +/// Batch size matching the TPC-H generator's streaming batches. +const BATCH_SIZE: usize = 8192 * 64; + +/// Batch iterator for one partition of `table`, from the arrow-56 `spatialbench` crates. +fn iterator( + table: Table, + scale_factor: f64, + part: i32, + part_count: i32, +) -> Box { + match table { + Table::Trip => Box::new( + TripArrow::new(TripGenerator::new(scale_factor, part, part_count)) + .with_batch_size(BATCH_SIZE), + ), + } +} + +/// Generate the SpatialBench base tables as parquet under `{output_dir}/parquet/`. +pub async fn generate_tables(scale_factor: &str, output_dir: PathBuf) -> Result<()> { + let scale_factor = scale_factor.parse::()?; + let parquet_dir = output_dir.join(Format::Parquet.name()); + fs::create_dir_all(&parquet_dir)?; + + // One part per unit of scale factor keeps each file near the ~350MB the trip generator + // produces at SF1. 
+ #[expect(clippy::cast_possible_truncation, clippy::cast_sign_loss)] + let num_parts = (scale_factor.ceil() as usize).max(1); + let part_count = i32::try_from(num_parts)?; + + for &table in TABLES { + for part_idx in 0..num_parts { + let output_file = parquet_dir.join(format!("{}_{part_idx}.parquet", table.name())); + let part = i32::try_from(part_idx + 1)?; + + idempotent_async(output_file.to_string_lossy().as_ref(), |path| async move { + info!( + scale_factor, + part, + part_count, + table = table.name(), + "Generating SpatialBench table" + ); + + let iter = iterator(table, scale_factor, part, part_count); + let schema = Arc::clone(iter.schema()); + + let file = TokioFile::create(&path).await?; + let props = WriterProperties::builder() + .set_compression(Compression::SNAPPY) + .build(); + let mut writer = AsyncArrowWriter::try_new(file, schema, Some(props))?; + for batch in iter { + writer.write(&batch).await?; + } + writer.close().await?; + + Ok::<(), anyhow::Error>(()) + }) + .await?; + } + } + + Ok(()) +} diff --git a/vortex-bench/src/spatialbench/mod.rs b/vortex-bench/src/spatialbench/mod.rs new file mode 100644 index 00000000000..bba06bd7ef9 --- /dev/null +++ b/vortex-bench/src/spatialbench/mod.rs @@ -0,0 +1,11 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! SpatialBench geospatial analytics benchmark. +//! +//! See . + +pub mod benchmark; +pub mod datagen; + +pub use benchmark::SpatialBenchBenchmark; diff --git a/vortex-bench/src/v3.rs b/vortex-bench/src/v3.rs index 48e8a7f1c94..7ac1d72365b 100644 --- a/vortex-bench/src/v3.rs +++ b/vortex-bench/src/v3.rs @@ -294,6 +294,7 @@ fn canonical_tpc_scale_factor(scale_factor: &str) -> String { /// | `GhArchive` | `gharchive` | `None` | `None` | | /// | `Appian` | `appian` | `None` | `None` | Static dataset; no scale factor. | /// | `PublicBi { name }` | `public-bi` | dataset name (e.g. 
`cms-provider`) | `None` | Sub-dataset name lives in `dataset_variant`. | +/// | `SpatialBench { scale_factor, native_points }` | `spatialbench` | `points-native` when native, else `None` | SF as string | Same canonicalization as TPC-H; no historical v2 records to merge with. | pub fn benchmark_dataset_dims(d: &BenchmarkDataset) -> (String, Option, Option) { match d { BenchmarkDataset::TpcH { scale_factor } => ( @@ -318,6 +319,14 @@ pub fn benchmark_dataset_dims(d: &BenchmarkDataset) -> (String, Option, // live). Drop it to keep live ingests merging into the migrated // group. The dataset-level `n_rows` is recoverable from the bench // matrix if ever needed. + BenchmarkDataset::SpatialBench { + scale_factor, + native_points, + } => ( + "spatialbench".to_string(), + native_points.then(|| "points-native".to_string()), + Some(canonical_tpc_scale_factor(scale_factor)), + ), BenchmarkDataset::StatPopGen { .. } => ("statpopgen".to_string(), None, None), BenchmarkDataset::PolarSignals { .. } => ("polarsignals".to_string(), None, None), BenchmarkDataset::Fineweb => ("fineweb".to_string(), None, None),