diff --git a/.github/workflows/pr.yml b/.github/workflows/pr.yml index 6d6f44370..e505ca701 100644 --- a/.github/workflows/pr.yml +++ b/.github/workflows/pr.yml @@ -63,9 +63,31 @@ jobs: git submodule foreach 'git fetch --unshallow || true' ./gradlew build --rerun-tasks + integration: + name: Integration tests + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v6 + with: + submodules: recursive + - name: Set up JDK 17 + uses: actions/setup-java@v5 + with: + java-version: '17' + distribution: 'temurin' + - name: Setup Gradle + uses: gradle/actions/setup-gradle@v6 + - name: Build with Gradle + run: | + # fetch submodule tags since actions/checkout@v6 does not + git submodule foreach 'git fetch --unshallow || true' + + ./gradlew integrationTest isthmus-native-image-mac-linux: name: Build Isthmus Native Image - needs: java + needs: + - java + - integration runs-on: ${{ matrix.os }} strategy: matrix: diff --git a/gradle.properties b/gradle.properties index 783346088..911ca826f 100644 --- a/gradle.properties +++ b/gradle.properties @@ -1,4 +1,4 @@ -org.gradle.jvmargs=-XX:+UseG1GC -Xmx1g -XX:MaxMetaspaceSize=512m --add-exports jdk.compiler/com.sun.tools.javac.api=ALL-UNNAMED \ +org.gradle.jvmargs=-XX:+UseG1GC -Xmx2g -XX:MaxMetaspaceSize=512m --add-exports jdk.compiler/com.sun.tools.javac.api=ALL-UNNAMED \ --add-exports jdk.compiler/com.sun.tools.javac.file=ALL-UNNAMED \ --add-exports jdk.compiler/com.sun.tools.javac.parser=ALL-UNNAMED \ --add-exports jdk.compiler/com.sun.tools.javac.tree=ALL-UNNAMED \ diff --git a/gradle/libs.versions.toml b/gradle/libs.versions.toml index db9d7d6b1..e1a532338 100644 --- a/gradle/libs.versions.toml +++ b/gradle/libs.versions.toml @@ -13,6 +13,7 @@ jspecify = "1.0.0" junit = "6.1.0" nmcp = "1.5.0" picocli = "4.7.7" +postgresql = "42.7.11" protobuf-plugin = "0.10.0" protobuf = "3.25.9" scala-2-12 = "2.12.20" @@ -27,6 +28,7 @@ spark-3-4 = "3.4.4" spark-3-5 = "3.5.4" spark-4-0 = "4.0.2" spotless = "8.6.0" +testcontainers = "2.0.5" 
validator = "3.0.3" [libraries] @@ -57,6 +59,7 @@ junit-platform-launcher = { module = "org.junit.platform:junit-platform-launcher junit-platform-engine = { module = "org.junit.platform:junit-platform-engine" } picocli = { module = "info.picocli:picocli", version.ref = "picocli" } picocli-codegen = { module = "info.picocli:picocli-codegen", version.ref = "picocli" } +postgresql-jdbc = { module = "org.postgresql:postgresql", version.ref = "postgresql" } protobuf-java = { module = "com.google.protobuf:protobuf-java", version.ref = "protobuf" } protobuf-java-util = { module = "com.google.protobuf:protobuf-java-util", version.ref = "protobuf" } protoc = { module = "com.google.protobuf:protoc", version.ref = "protobuf" } @@ -80,9 +83,13 @@ spark-catalyst-4-0-2-13 = { module = "org.apache.spark:spark-catalyst_2.13", ver spark-core-4-0-2-13 = { module = "org.apache.spark:spark-core_2.13", version.ref = "spark-4-0" } spark-hive-4-0-2-13 = { module = "org.apache.spark:spark-hive_2.13", version.ref = "spark-4-0" } spark-sql-4-0-2-13 = { module = "org.apache.spark:spark-sql_2.13", version.ref = "spark-4-0" } +testcontainers = { module = "org.testcontainers:testcontainers", version.ref = "testcontainers" } +testcontainers-junit5 = { module = "org.testcontainers:testcontainers-junit-jupiter", version.ref = "testcontainers" } +testcontainers-postgres = { module = "org.testcontainers:testcontainers-postgresql", version.ref = "testcontainers" } [bundles] jackson = [ "jackson-databind", "jackson-annotations", "jackson-datatype-jdk8", "jackson-dataformat-yaml" ] +testcontainers = [ "testcontainers", "testcontainers-junit5", "testcontainers-postgres" ] [plugins] graal = { id = "org.graalvm.buildtools.native", version.ref = "graal-plugin" } diff --git a/isthmus/build.gradle.kts b/isthmus/build.gradle.kts index 067b62873..0f7a88798 100644 --- a/isthmus/build.gradle.kts +++ b/isthmus/build.gradle.kts @@ -118,6 +118,12 @@ dependencies { } testImplementation(libs.protobuf.java) 
api(libs.jspecify)
+
+  testImplementation(libs.bundles.testcontainers)
+
+  testImplementation(libs.postgresql.jdbc)
+
+  testImplementation(libs.slf4j.jdk14)
 }
 
 tasks {
@@ -139,6 +145,23 @@ tasks {
 
   // Only set the compile release since JUnit 6 requires Java 17 to run tests.
   compileJava { options.release = 11 }
+
+  test {
+    // Exclude integration tests by default
+    useJUnitPlatform { excludeTags("integration") }
+  }
+}
+
+// Register a separate Test task that runs only the tests tagged "integration"
+val test by testing.suites.existing(JvmTestSuite::class)
+
+tasks.register<Test>("integrationTest") {
+  description = "Run integration tests"
+  group = "verification"
+  testClassesDirs = files(test.map { it.sources.output.classesDirs })
+  classpath = files(test.map { it.sources.runtimeClasspath })
+  useJUnitPlatform { includeTags("integration") }
+  shouldRunAfter(tasks.test)
 }
 
 sourceSets { test { proto.srcDirs("src/test/resources/extensions") } }
diff --git a/isthmus/src/test/java/io/substrait/isthmus/integration/PostgreSqlIntegrationTest.java b/isthmus/src/test/java/io/substrait/isthmus/integration/PostgreSqlIntegrationTest.java
new file mode 100644
index 000000000..d578b0db9
--- /dev/null
+++ b/isthmus/src/test/java/io/substrait/isthmus/integration/PostgreSqlIntegrationTest.java
@@ -0,0 +1,138 @@
+package io.substrait.isthmus.integration;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertFalse;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+import io.substrait.isthmus.ConverterProvider;
+import io.substrait.isthmus.PlanTestBase;
+import io.substrait.isthmus.SqlToSubstrait;
+import io.substrait.isthmus.SubstraitToSql;
+import io.substrait.plan.Plan;
+import java.io.IOException;
+import java.sql.Connection;
+import java.sql.ResultSet;
+import java.sql.SQLException;
+import java.sql.Statement;
+import java.util.List;
+import java.util.stream.Collectors;
+import java.util.stream.IntStream;
+import java.util.stream.Stream;
+import org.apache.calcite.sql.dialect.PostgresqlSqlDialect;
+import org.apache.calcite.sql.parser.SqlParseException;
+import org.junit.jupiter.api.Tag;
+import org.junit.jupiter.params.ParameterizedTest;
+import org.junit.jupiter.params.provider.MethodSource;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.testcontainers.containers.BindMode;
+import org.testcontainers.containers.GenericContainer;
+import org.testcontainers.containers.JdbcDatabaseContainer.NoDriverFoundException;
+import org.testcontainers.junit.jupiter.Container;
+import org.testcontainers.junit.jupiter.Testcontainers;
+import org.testcontainers.postgresql.PostgreSQLContainer;
+import org.testcontainers.utility.DockerImageName;
+
+@Tag("integration")
+@Testcontainers
+class PostgreSqlIntegrationTest extends PlanTestBase {
+  private static final Logger LOG = LoggerFactory.getLogger(PostgreSqlIntegrationTest.class);
+
+  // TODO: These queries produce different results when generated from Substrait
+  private static final List<Integer> EXCLUDED_QUERIES = List.of(14);
+
+  private static final DockerImageName UV_IMAGE =
+      DockerImageName.parse("ghcr.io/astral-sh/uv:python3.14-trixie-slim");
+  private static final DockerImageName POSTGRES_IMAGE = DockerImageName.parse("postgres:18");
+
+  private static final String TPCH_DATA_HOST_PATH = "tpch/data";
+  private static final String TPCH_DATA_CONTAINER_PATH = "/tmp/tpc-h";
+  private static final String TPCH_INIT_HOST_PATH = "tpch/postgresql/tpch_init.sql";
+
+  private static final List<String> TPCHGEN_ARGS =
+      List.of(
+          "--scale-factor", "0.001", "--format", "csv", "--output-dir", TPCH_DATA_CONTAINER_PATH);
+
+  private static final List<String> TPCHGEN_CMD =
+      Stream.concat(
+              Stream.of("uvx", "--from", "tpchgen-cli == 2.*", "tpchgen-cli"),
+              TPCHGEN_ARGS.stream())
+          .collect(Collectors.toList());
+
+  /** One-shot container that generates the TPC-H CSV data which PostgreSQL loads on startup. */
+  @Container
+  @SuppressWarnings("resource")
+  private static final GenericContainer<?> tpchgen =
+      new GenericContainer<>(UV_IMAGE)
+          .withClasspathResourceMapping(
+              TPCH_DATA_HOST_PATH, TPCH_DATA_CONTAINER_PATH, BindMode.READ_WRITE)
+          .withCommand(TPCHGEN_CMD.toArray(new String[0]))
+          .withStartupCheckStrategy(new SuccessfulExitCheckStrategy());
+
+  /** PostgreSQL instance shared across all test methods in this class. */
+  @Container
+  @SuppressWarnings("resource")
+  private static final PostgreSQLContainer postgres =
+      new PostgreSQLContainer(POSTGRES_IMAGE)
+          .dependsOn(tpchgen)
+          .withClasspathResourceMapping(
+              TPCH_DATA_HOST_PATH, TPCH_DATA_CONTAINER_PATH, BindMode.READ_ONLY)
+          .withClasspathResourceMapping(
+              TPCH_INIT_HOST_PATH, "/docker-entrypoint-initdb.d/tpch_init.sql", BindMode.READ_ONLY);
+
+  private static final String COMPARE_RESULTS_SQL_TEMPLATE =
+      """
+      WITH expected AS (%s),
+      actual AS (%s)
+      SELECT count(*) FROM (
+        SELECT * FROM
+          (SELECT * FROM expected EXCEPT SELECT * FROM actual)
+          UNION (SELECT * FROM actual EXCEPT SELECT * FROM expected)
+      )
+      """;
+
+  static IntStream tpcHTestCases() {
+    return IntStream.rangeClosed(1, 22).filter(i -> !EXCLUDED_QUERIES.contains(i));
+  }
+
+  @ParameterizedTest
+  @MethodSource("tpcHTestCases")
+  void testTpcH(final int queryNo)
+      throws NoDriverFoundException, SQLException, IOException, SqlParseException {
+    // Round trip: TPC-H SQL -> Substrait plan -> PostgreSQL SQL, then compare results in the DB.
+    final String inputSql = asString(String.format("tpch/queries/%02d.sql", queryNo));
+    final SqlToSubstrait sqlToSubstrait = new SqlToSubstrait();
+    final Plan plan = sqlToSubstrait.convert(inputSql, TPCH_CATALOG);
+
+    final ConverterProvider provider = new ConverterProvider(extensions);
+    final SubstraitToSql substraitToSql = new SubstraitToSql(provider);
+
+    final String generatedSql = substraitToSql.convert(plan, PostgresqlSqlDialect.DEFAULT).get(0);
+
+    final String referenceSql = asString(String.format("tpch/postgresql/%02d.sql", queryNo));
+
+    final String compareSql =
+        String.format(COMPARE_RESULTS_SQL_TEMPLATE, referenceSql, generatedSql);
+
+    LOG.atDebug().log(compareSql);
+
+    try (Connection conn = postgres.createConnection("");
+        Statement stmt = conn.createStatement();
+        ResultSet result = stmt.executeQuery(compareSql); ) {
+      // we expect exactly one row
+      assertTrue(result.next());
+
+      // the count should be zero if both the reference and generated SQL produce the same results
+      int differenceCount = result.getInt(1);
+      assertEquals(
+          0,
+          differenceCount,
+          String.format(
+              "Reference and generated SQL produce %d different results.\n\nReference SQL:\n%s\n\nGenerated SQL:\n%s",
+              differenceCount, referenceSql, generatedSql));
+
+      // ... and no further rows after the single count row
+      assertFalse(result.next());
+    }
+  }
+}
diff --git a/isthmus/src/test/java/io/substrait/isthmus/integration/SuccessfulExitCheckStrategy.java b/isthmus/src/test/java/io/substrait/isthmus/integration/SuccessfulExitCheckStrategy.java
new file mode 100644
index 000000000..2f8d02e10
--- /dev/null
+++ b/isthmus/src/test/java/io/substrait/isthmus/integration/SuccessfulExitCheckStrategy.java
@@ -0,0 +1,28 @@
+package io.substrait.isthmus.integration;
+
+import com.github.dockerjava.api.DockerClient;
+import com.github.dockerjava.api.command.InspectContainerResponse;
+import org.testcontainers.containers.startupcheck.StartupCheckStrategy;
+import org.testcontainers.utility.DockerStatus;
+
+/**
+ * A {@link StartupCheckStrategy} that checks if the container has exited with a successful exit
+ * code. This allows use of a container that is launched purely to execute a command prior to
+ * startup of dependent containers.
+ */
+public class SuccessfulExitCheckStrategy extends StartupCheckStrategy {
+  @Override
+  public StartupStatus checkStartupState(DockerClient dockerClient, String containerId) {
+    final InspectContainerResponse.ContainerState containerState =
+        getCurrentState(dockerClient, containerId);
+    // Not stopped yet: the outcome cannot be determined at this point.
+    if (!DockerStatus.isContainerStopped(containerState)) {
+      return StartupStatus.NOT_YET_KNOWN;
+    }
+
+    // Stopped: the exit code alone decides between success and failure.
+    return DockerStatus.isContainerExitCodeSuccess(containerState)
+        ? StartupStatus.SUCCESSFUL
+        : StartupStatus.FAILED;
+  }
+}
diff --git a/isthmus/src/test/resources/logging.properties b/isthmus/src/test/resources/logging.properties
new file mode 100644
index 000000000..f31a000ed
--- /dev/null
+++ b/isthmus/src/test/resources/logging.properties
@@ -0,0 +1,2 @@
+# Root logger level - set to INFO for test visibility
+.level=INFO
diff --git a/isthmus/src/test/resources/tpch/data/.gitignore b/isthmus/src/test/resources/tpch/data/.gitignore
new file mode 100644
index 000000000..3c14738cf
--- /dev/null
+++ b/isthmus/src/test/resources/tpch/data/.gitignore
@@ -0,0 +1,5 @@
+# TPC-H test data is generated to this directory.
+ +*.tbl +*.csv +*.parquet diff --git a/isthmus/src/test/resources/tpch/postgresql/01.sql b/isthmus/src/test/resources/tpch/postgresql/01.sql new file mode 100644 index 000000000..eec08fe6f --- /dev/null +++ b/isthmus/src/test/resources/tpch/postgresql/01.sql @@ -0,0 +1,22 @@ +select + "L_RETURNFLAG", + "L_LINESTATUS", + sum("L_QUANTITY") as "SUM_QTY", + sum("L_EXTENDEDPRICE") as "SUM_BASE_PRICE", + sum("L_EXTENDEDPRICE" * (1 - "L_DISCOUNT")) as "SUM_DISC_PRICE", + sum("L_EXTENDEDPRICE" * (1 - "L_DISCOUNT") * (1 + "L_TAX")) as "SUM_CHARGE", + avg("L_QUANTITY") as "AVG_QTY", + avg("L_EXTENDEDPRICE") as "AVG_PRICE", + avg("L_DISCOUNT") as "AVG_DISC", + count(*) as "COUNT_ORDER" +from + "LINEITEM" +where + "L_SHIPDATE" <= date '1998-12-01' - interval '120 days' +group by + "L_RETURNFLAG", + "L_LINESTATUS" + +order by + "L_RETURNFLAG", + "L_LINESTATUS" diff --git a/isthmus/src/test/resources/tpch/postgresql/02.sql b/isthmus/src/test/resources/tpch/postgresql/02.sql new file mode 100644 index 000000000..6247fdff2 --- /dev/null +++ b/isthmus/src/test/resources/tpch/postgresql/02.sql @@ -0,0 +1,47 @@ +select + "S"."S_ACCTBAL", + "S"."S_NAME", + "N"."N_NAME", + "P"."P_PARTKEY", + "P"."P_MFGR", + "S"."S_ADDRESS", + "S"."S_PHONE", + "S"."S_COMMENT" +from + "PART" "P", + "SUPPLIER" "S", + "PARTSUPP" "PS", + "NATION" "N", + "REGION" "R" +where + "P"."P_PARTKEY" = "PS"."PS_PARTKEY" + and "S"."S_SUPPKEY" = "PS"."PS_SUPPKEY" + and "P"."P_SIZE" = 41 + and "P"."P_TYPE" like '%NICKEL' + and "S"."S_NATIONKEY" = "N"."N_NATIONKEY" + and "N"."N_REGIONKEY" = "R"."R_REGIONKEY" + and "R"."R_NAME" = 'EUROPE' + and "PS"."PS_SUPPLYCOST" = ( + + select + min("PS"."PS_SUPPLYCOST") + + from + "PARTSUPP" "PS", + "SUPPLIER" "S", + "NATION" "N", + "REGION" "R" + where + "P"."P_PARTKEY" = "PS"."PS_PARTKEY" + and "S"."S_SUPPKEY" = "PS"."PS_SUPPKEY" + and "S"."S_NATIONKEY" = "N"."N_NATIONKEY" + and "N"."N_REGIONKEY" = "R"."R_REGIONKEY" + and "R"."R_NAME" = 'EUROPE' + ) + +order by + "S"."S_ACCTBAL" 
desc, + "N"."N_NAME", + "S"."S_NAME", + "P"."P_PARTKEY" +limit 100 diff --git a/isthmus/src/test/resources/tpch/postgresql/03.sql b/isthmus/src/test/resources/tpch/postgresql/03.sql new file mode 100644 index 000000000..cfb0b620f --- /dev/null +++ b/isthmus/src/test/resources/tpch/postgresql/03.sql @@ -0,0 +1,26 @@ +select + "L"."L_ORDERKEY", + sum("L"."L_EXTENDEDPRICE" * (1 - "L"."L_DISCOUNT")) as "REVENUE", + "O"."O_ORDERDATE", + "O"."O_SHIPPRIORITY" + +from + "CUSTOMER" "C", + "ORDERS" "O", + "LINEITEM" "L" + +where + "C"."C_MKTSEGMENT" = 'HOUSEHOLD' + and "C"."C_CUSTKEY" = "O"."O_CUSTKEY" + and "L"."L_ORDERKEY" = "O"."O_ORDERKEY" + and "O"."O_ORDERDATE" < date '1995-03-25' + and "L"."L_SHIPDATE" > date '1995-03-25' + +group by + "L"."L_ORDERKEY", + "O"."O_ORDERDATE", + "O"."O_SHIPPRIORITY" +order by + "REVENUE" desc, + "O"."O_ORDERDATE" +limit 10 diff --git a/isthmus/src/test/resources/tpch/postgresql/04.sql b/isthmus/src/test/resources/tpch/postgresql/04.sql new file mode 100644 index 000000000..315fcd754 --- /dev/null +++ b/isthmus/src/test/resources/tpch/postgresql/04.sql @@ -0,0 +1,23 @@ +select + "O"."O_ORDERPRIORITY", + count(*) as "ORDER_COUNT" +from + "ORDERS" "O" + +where + "O"."O_ORDERDATE" >= date '1996-10-01' + and "O"."O_ORDERDATE" < date '1996-10-01' + interval '3 months' + and + exists ( + select + * + from + "LINEITEM" "L" + where + "L"."L_ORDERKEY" = "O"."O_ORDERKEY" + and "L"."L_COMMITDATE" < "L"."L_RECEIPTDATE" + ) +group by + "O"."O_ORDERPRIORITY" +order by + "O"."O_ORDERPRIORITY" diff --git a/isthmus/src/test/resources/tpch/postgresql/05.sql b/isthmus/src/test/resources/tpch/postgresql/05.sql new file mode 100644 index 000000000..bfa453029 --- /dev/null +++ b/isthmus/src/test/resources/tpch/postgresql/05.sql @@ -0,0 +1,27 @@ +select + "N"."N_NAME", + sum("L"."L_EXTENDEDPRICE" * (1 - "L"."L_DISCOUNT")) as "REVENUE" + +from + "CUSTOMER" "C", + "ORDERS" "O", + "LINEITEM" "L", + "SUPPLIER" "S", + "NATION" "N", + "REGION" "R" + +where + 
"C"."C_CUSTKEY" = "O"."O_CUSTKEY" + and "L"."L_ORDERKEY" = "O"."O_ORDERKEY" + and "L"."L_SUPPKEY" = "S"."S_SUPPKEY" + and "C"."C_NATIONKEY" = "S"."S_NATIONKEY" + and "S"."S_NATIONKEY" = "N"."N_NATIONKEY" + and "N"."N_REGIONKEY" = "R"."R_REGIONKEY" + and "R"."R_NAME" = 'EUROPE' + and "O"."O_ORDERDATE" >= date '1997-01-01' + and "O"."O_ORDERDATE" < date '1997-01-01' + interval '1 year' +group by + "N"."N_NAME" + +order by + "REVENUE" desc diff --git a/isthmus/src/test/resources/tpch/postgresql/06.sql b/isthmus/src/test/resources/tpch/postgresql/06.sql new file mode 100644 index 000000000..78aaa7803 --- /dev/null +++ b/isthmus/src/test/resources/tpch/postgresql/06.sql @@ -0,0 +1,10 @@ +select + sum("L_EXTENDEDPRICE" * "L_DISCOUNT") as "REVENUE" +from + "LINEITEM" +where + "L_SHIPDATE" >= date '1997-01-01' + and "L_SHIPDATE" < date '1997-01-01' + interval '1 year' + and + "L_DISCOUNT" between 0.03 - 0.01 and 0.03 + 0.01 + and "L_QUANTITY" < 24 diff --git a/isthmus/src/test/resources/tpch/postgresql/07.sql b/isthmus/src/test/resources/tpch/postgresql/07.sql new file mode 100644 index 000000000..d186ccee6 --- /dev/null +++ b/isthmus/src/test/resources/tpch/postgresql/07.sql @@ -0,0 +1,39 @@ +select + "SUPP_NATION", + "CUST_NATION", + "L_YEAR", + sum("VOLUME") as "REVENUE" +from + ( + select + "N1"."N_NAME" as "SUPP_NATION", + "N2"."N_NAME" as "CUST_NATION", + extract(year from "L"."L_SHIPDATE") as "L_YEAR", + "L"."L_EXTENDEDPRICE" * (1 - "L"."L_DISCOUNT") as "VOLUME" + from + "SUPPLIER" "S", + "LINEITEM" "L", + "ORDERS" "O", + "CUSTOMER" "C", + "NATION" "N1", + "NATION" "N2" + where + "S"."S_SUPPKEY" = "L"."L_SUPPKEY" + and "O"."O_ORDERKEY" = "L"."L_ORDERKEY" + and "C"."C_CUSTKEY" = "O"."O_CUSTKEY" + and "S"."S_NATIONKEY" = "N1"."N_NATIONKEY" + and "C"."C_NATIONKEY" = "N2"."N_NATIONKEY" + and ( + ("N1"."N_NAME" = 'EGYPT' and "N2"."N_NAME" = 'UNITED STATES') + or ("N1"."N_NAME" = 'UNITED STATES' and "N2"."N_NAME" = 'EGYPT') + ) + and "L"."L_SHIPDATE" between date 
'1995-01-01' and date '1996-12-31' + ) as "SHIPPING" +group by + "SUPP_NATION", + "CUST_NATION", + "L_YEAR" +order by + "SUPP_NATION", + "CUST_NATION", + "L_YEAR" diff --git a/isthmus/src/test/resources/tpch/postgresql/08.sql b/isthmus/src/test/resources/tpch/postgresql/08.sql new file mode 100644 index 000000000..ec9f52f9c --- /dev/null +++ b/isthmus/src/test/resources/tpch/postgresql/08.sql @@ -0,0 +1,37 @@ +select + "O_YEAR", + sum(case + when "NATION" = 'EGYPT' then "VOLUME" + else 0 + end) / sum("VOLUME") as "MKT_SHARE" +from + ( + select + extract(year from "O"."O_ORDERDATE") as "O_YEAR", + "L"."L_EXTENDEDPRICE" * (1 - "L"."L_DISCOUNT") as "VOLUME", + "N2"."N_NAME" as "NATION" + from + "PART" "P", + "SUPPLIER" "S", + "LINEITEM" "L", + "ORDERS" "O", + "CUSTOMER" "C", + "NATION" "N1", + "NATION" "N2", + "REGION" "R" + where + "P"."P_PARTKEY" = "L"."L_PARTKEY" + and "S"."S_SUPPKEY" = "L"."L_SUPPKEY" + and "L"."L_ORDERKEY" = "O"."O_ORDERKEY" + and "O"."O_CUSTKEY" = "C"."C_CUSTKEY" + and "C"."C_NATIONKEY" = "N1"."N_NATIONKEY" + and "N1"."N_REGIONKEY" = "R"."R_REGIONKEY" + and "R"."R_NAME" = 'MIDDLE EAST' + and "S"."S_NATIONKEY" = "N2"."N_NATIONKEY" + and "O"."O_ORDERDATE" between date '1995-01-01' and date '1996-12-31' + and "P"."P_TYPE" = 'PROMO BRUSHED COPPER' + ) as "ALL_NATIONS" +group by + "O_YEAR" +order by + "O_YEAR" diff --git a/isthmus/src/test/resources/tpch/postgresql/09.sql b/isthmus/src/test/resources/tpch/postgresql/09.sql new file mode 100644 index 000000000..3f4422fdd --- /dev/null +++ b/isthmus/src/test/resources/tpch/postgresql/09.sql @@ -0,0 +1,32 @@ +select + "NATION", + "O_YEAR", + sum("AMOUNT") as "SUM_PROFIT" +from + ( + select + "N"."N_NAME" as "NATION", + extract(year from "O"."O_ORDERDATE") as "O_YEAR", + "L"."L_EXTENDEDPRICE" * (1 - "L"."L_DISCOUNT") - "PS"."PS_SUPPLYCOST" * "L"."L_QUANTITY" as "AMOUNT" + from + "PART" "P", + "SUPPLIER" "S", + "LINEITEM" "L", + "PARTSUPP" "PS", + "ORDERS" "O", + "NATION" "N" + where + "S"."S_SUPPKEY" = 
"L"."L_SUPPKEY" + and "PS"."PS_SUPPKEY" = "L"."L_SUPPKEY" + and "PS"."PS_PARTKEY" = "L"."L_PARTKEY" + and "P"."P_PARTKEY" = "L"."L_PARTKEY" + and "O"."O_ORDERKEY" = "L"."L_ORDERKEY" + and "S"."S_NATIONKEY" = "N"."N_NATIONKEY" + and "P"."P_NAME" like '%yellow%' + ) as "PROFIT" +group by + "NATION", + "O_YEAR" +order by + "NATION", + "O_YEAR" desc diff --git a/isthmus/src/test/resources/tpch/postgresql/10.sql b/isthmus/src/test/resources/tpch/postgresql/10.sql new file mode 100644 index 000000000..9557250f6 --- /dev/null +++ b/isthmus/src/test/resources/tpch/postgresql/10.sql @@ -0,0 +1,32 @@ +select + "C"."C_CUSTKEY", + "C"."C_NAME", + sum("L"."L_EXTENDEDPRICE" * (1 - "L"."L_DISCOUNT")) as "REVENUE", + "C"."C_ACCTBAL", + "N"."N_NAME", + "C"."C_ADDRESS", + "C"."C_PHONE", + "C"."C_COMMENT" +from + "CUSTOMER" "C", + "ORDERS" "O", + "LINEITEM" "L", + "NATION" "N" +where + "C"."C_CUSTKEY" = "O"."O_CUSTKEY" + and "L"."L_ORDERKEY" = "O"."O_ORDERKEY" + and "O"."O_ORDERDATE" >= date '1994-03-01' + and "O"."O_ORDERDATE" < date '1994-03-01' + interval '3 months' + and "L"."L_RETURNFLAG" = 'R' + and "C"."C_NATIONKEY" = "N"."N_NATIONKEY" +group by + "C"."C_CUSTKEY", + "C"."C_NAME", + "C"."C_ACCTBAL", + "C"."C_PHONE", + "N"."N_NAME", + "C"."C_ADDRESS", + "C"."C_COMMENT" +order by + "REVENUE" desc +limit 20 diff --git a/isthmus/src/test/resources/tpch/postgresql/11.sql b/isthmus/src/test/resources/tpch/postgresql/11.sql new file mode 100644 index 000000000..fcf3fcff5 --- /dev/null +++ b/isthmus/src/test/resources/tpch/postgresql/11.sql @@ -0,0 +1,27 @@ +select + "PS"."PS_PARTKEY", + sum("PS"."PS_SUPPLYCOST" * "PS"."PS_AVAILQTY") as "VALUE" +from + "PARTSUPP" "PS", + "SUPPLIER" "S", + "NATION" "N" +where + "PS"."PS_SUPPKEY" = "S"."S_SUPPKEY" + and "S"."S_NATIONKEY" = "N"."N_NATIONKEY" + and "N"."N_NAME" = 'JAPAN' +group by + "PS"."PS_PARTKEY" having + sum("PS"."PS_SUPPLYCOST" * "PS"."PS_AVAILQTY") > ( + select + sum("PS"."PS_SUPPLYCOST" * "PS"."PS_AVAILQTY") * 0.0001000000 + from + 
"PARTSUPP" "PS", + "SUPPLIER" "S", + "NATION" "N" + where + "PS"."PS_SUPPKEY" = "S"."S_SUPPKEY" + and "S"."S_NATIONKEY" = "N"."N_NATIONKEY" + and "N"."N_NAME" = 'JAPAN' + ) +order by + "VALUE" desc diff --git a/isthmus/src/test/resources/tpch/postgresql/12.sql b/isthmus/src/test/resources/tpch/postgresql/12.sql new file mode 100644 index 000000000..5413a674e --- /dev/null +++ b/isthmus/src/test/resources/tpch/postgresql/12.sql @@ -0,0 +1,28 @@ +select + "L"."L_SHIPMODE", + sum(case + when "O"."O_ORDERPRIORITY" = '1-URGENT' + or "O"."O_ORDERPRIORITY" = '2-HIGH' + then 1 + else 0 + end) as "HIGH_LINE_COUNT", + sum(case + when "O"."O_ORDERPRIORITY" <> '1-URGENT' + and "O"."O_ORDERPRIORITY" <> '2-HIGH' + then 1 + else 0 + end) as "LOW_LINE_COUNT" +from + "ORDERS" "O", + "LINEITEM" "L" +where + "O"."O_ORDERKEY" = "L"."L_ORDERKEY" + and "L"."L_SHIPMODE" in ('TRUCK', 'REG AIR') + and "L"."L_COMMITDATE" < "L"."L_RECEIPTDATE" + and "L"."L_SHIPDATE" < "L"."L_COMMITDATE" + and "L"."L_RECEIPTDATE" >= date '1994-01-01' + and "L"."L_RECEIPTDATE" < date '1994-01-01' + interval '1 year' +group by + "L"."L_SHIPMODE" +order by + "L"."L_SHIPMODE" diff --git a/isthmus/src/test/resources/tpch/postgresql/13.sql b/isthmus/src/test/resources/tpch/postgresql/13.sql new file mode 100644 index 000000000..a2bcd2155 --- /dev/null +++ b/isthmus/src/test/resources/tpch/postgresql/13.sql @@ -0,0 +1,21 @@ +select + "C_COUNT", + count(*) as "CUSTDIST" +from + ( + select + "C"."C_CUSTKEY", + count("O"."O_ORDERKEY") + from + "CUSTOMER" "C" + left outer join "ORDERS" "O" + on "C"."C_CUSTKEY" = "O"."O_CUSTKEY" + and "O"."O_COMMENT" not like '%special%requests%' + group by + "C"."C_CUSTKEY" + ) as "ORDERS" ("C_CUSTKEY", "C_COUNT") +group by + "C_COUNT" +order by + "CUSTDIST" desc, + "C_COUNT" desc diff --git a/isthmus/src/test/resources/tpch/postgresql/14.sql b/isthmus/src/test/resources/tpch/postgresql/14.sql new file mode 100644 index 000000000..62ca157db --- /dev/null +++ 
b/isthmus/src/test/resources/tpch/postgresql/14.sql @@ -0,0 +1,13 @@ +select + 100.00 * sum(case + when "P"."P_TYPE" like 'PROMO%' + then "L"."L_EXTENDEDPRICE" * (1 - "L"."L_DISCOUNT") + else 0 + end) / sum("L"."L_EXTENDEDPRICE" * (1 - "L"."L_DISCOUNT")) as "PROMO_REVENUE" +from + "LINEITEM" "L", + "PART" "P" +where + "L"."L_PARTKEY" = "P"."P_PARTKEY" + and "L"."L_SHIPDATE" >= date '1994-08-01' + and "L"."L_SHIPDATE" < date '1994-08-01' + interval '1 month' diff --git a/isthmus/src/test/resources/tpch/postgresql/15.sql b/isthmus/src/test/resources/tpch/postgresql/15.sql new file mode 100644 index 000000000..d2f77a922 --- /dev/null +++ b/isthmus/src/test/resources/tpch/postgresql/15.sql @@ -0,0 +1,32 @@ +-- converted to CTE since DDL is not part of Substrait. +with "REVENUE0"("SUPPLIER_NO", "TOTAL_REVENUE") as ( + select + "L_SUPPKEY", + sum("L_EXTENDEDPRICE" * (1 - "L_DISCOUNT")) + from + "LINEITEM" + where + "L_SHIPDATE" >= date '1993-05-01' + and "L_SHIPDATE" < date '1993-05-01' + interval '3 month' + group by + "L_SUPPKEY") + +select + "S"."S_SUPPKEY", + "S"."S_NAME", + "S"."S_ADDRESS", + "S"."S_PHONE", + "R"."TOTAL_REVENUE" +from + "SUPPLIER" "S", + "REVENUE0" "R" +where + "S"."S_SUPPKEY" = "R"."SUPPLIER_NO" + and "R"."TOTAL_REVENUE" = ( + select + max("TOTAL_REVENUE") + from + "REVENUE0" + ) +order by + "S"."S_SUPPKEY" diff --git a/isthmus/src/test/resources/tpch/postgresql/16.sql b/isthmus/src/test/resources/tpch/postgresql/16.sql new file mode 100644 index 000000000..8744bb7bf --- /dev/null +++ b/isthmus/src/test/resources/tpch/postgresql/16.sql @@ -0,0 +1,30 @@ +select + "P"."P_BRAND", + "P"."P_TYPE", + "P"."P_SIZE", + count(distinct "PS"."PS_SUPPKEY") as "SUPPLIER_CNT" +from + "PARTSUPP" "PS", + "PART" "P" +where + "P"."P_PARTKEY" = "PS"."PS_PARTKEY" + and "P"."P_BRAND" <> 'Brand#21' + and "P"."P_TYPE" not like 'MEDIUM PLATED%' + and "P"."P_SIZE" in (38, 2, 8, 31, 44, 5, 14, 24) + and "PS"."PS_SUPPKEY" not in ( + select + "S"."S_SUPPKEY" + from + 
"SUPPLIER" "S" + where + "S"."S_COMMENT" like '%Customer%Complaints%' + ) +group by + "P"."P_BRAND", + "P"."P_TYPE", + "P"."P_SIZE" +order by + "SUPPLIER_CNT" desc, + "P"."P_BRAND", + "P"."P_TYPE", + "P"."P_SIZE" diff --git a/isthmus/src/test/resources/tpch/postgresql/17.sql b/isthmus/src/test/resources/tpch/postgresql/17.sql new file mode 100644 index 000000000..14d129c50 --- /dev/null +++ b/isthmus/src/test/resources/tpch/postgresql/17.sql @@ -0,0 +1,17 @@ +select + sum("L"."L_EXTENDEDPRICE") / 7.0 as "AVG_YEARLY" +from + "LINEITEM" "L", + "PART" "P" +where + "P"."P_PARTKEY" = "L"."L_PARTKEY" + and "P"."P_BRAND" = 'Brand#13' + and "P"."P_CONTAINER" = 'JUMBO CAN' + and "L"."L_QUANTITY" < ( + select + 0.2 * avg("L2"."L_QUANTITY") + from + "LINEITEM" "L2" + where + "L2"."L_PARTKEY" = "P"."P_PARTKEY" + ) diff --git a/isthmus/src/test/resources/tpch/postgresql/18.sql b/isthmus/src/test/resources/tpch/postgresql/18.sql new file mode 100644 index 000000000..49e618623 --- /dev/null +++ b/isthmus/src/test/resources/tpch/postgresql/18.sql @@ -0,0 +1,33 @@ +select + "C"."C_NAME", + "C"."C_CUSTKEY", + "O"."O_ORDERKEY", + "O"."O_ORDERDATE", + "O"."O_TOTALPRICE", + sum("L"."L_QUANTITY") +from + "CUSTOMER" "C", + "ORDERS" "O", + "LINEITEM" "L" +where + "O"."O_ORDERKEY" in ( + select + "L_ORDERKEY" + from + "LINEITEM" + group by + "L_ORDERKEY" having + sum("L_QUANTITY") > 300 + ) + and "C"."C_CUSTKEY" = "O"."O_CUSTKEY" + and "O"."O_ORDERKEY" = "L"."L_ORDERKEY" +group by + "C"."C_NAME", + "C"."C_CUSTKEY", + "O"."O_ORDERKEY", + "O"."O_ORDERDATE", + "O"."O_TOTALPRICE" +order by + "O"."O_TOTALPRICE" desc, + "O"."O_ORDERDATE" +limit 100 diff --git a/isthmus/src/test/resources/tpch/postgresql/19.sql b/isthmus/src/test/resources/tpch/postgresql/19.sql new file mode 100644 index 000000000..f47147ef7 --- /dev/null +++ b/isthmus/src/test/resources/tpch/postgresql/19.sql @@ -0,0 +1,35 @@ +select + sum("L"."L_EXTENDEDPRICE"* (1 - "L"."L_DISCOUNT")) as "REVENUE" +from + "LINEITEM" "L", + 
"PART" "P" +where + ( + "P"."P_PARTKEY" = "L"."L_PARTKEY" + and "P"."P_BRAND" = 'Brand#41' + and "P"."P_CONTAINER" in ('SM CASE', 'SM BOX', 'SM PACK', 'SM PKG') + and "L"."L_QUANTITY" >= 2 and "L"."L_QUANTITY" <= 2 + 10 + and "P"."P_SIZE" between 1 and 5 + and "L"."L_SHIPMODE" in ('AIR', 'AIR REG') + and "L"."L_SHIPINSTRUCT" = 'DELIVER IN PERSON' + ) + or + ( + "P"."P_PARTKEY" = "L"."L_PARTKEY" + and "P"."P_BRAND" = 'Brand#13' + and "P"."P_CONTAINER" in ('MED BAG', 'MED BOX', 'MED PKG', 'MED PACK') + and "L"."L_QUANTITY" >= 14 and "L"."L_QUANTITY" <= 14 + 10 + and "P"."P_SIZE" between 1 and 10 + and "L"."L_SHIPMODE" in ('AIR', 'AIR REG') + and "L"."L_SHIPINSTRUCT" = 'DELIVER IN PERSON' + ) + or + ( + "P"."P_PARTKEY" = "L"."L_PARTKEY" + and "P"."P_BRAND" = 'Brand#55' + and "P"."P_CONTAINER" in ('LG CASE', 'LG BOX', 'LG PACK', 'LG PKG') + and "L"."L_QUANTITY" >= 23 and "L"."L_QUANTITY" <= 23 + 10 + and "P"."P_SIZE" between 1 and 15 + and "L"."L_SHIPMODE" in ('AIR', 'AIR REG') + and "L"."L_SHIPINSTRUCT" = 'DELIVER IN PERSON' + ) diff --git a/isthmus/src/test/resources/tpch/postgresql/20.sql b/isthmus/src/test/resources/tpch/postgresql/20.sql new file mode 100644 index 000000000..b2ec46ae7 --- /dev/null +++ b/isthmus/src/test/resources/tpch/postgresql/20.sql @@ -0,0 +1,37 @@ +select + "S"."S_NAME", + "S"."S_ADDRESS" +from + "SUPPLIER" "S", + "NATION" "N" +where + "S"."S_SUPPKEY" in ( + select + "PS"."PS_SUPPKEY" + from + "PARTSUPP" "PS" + where + "PS"."PS_PARTKEY" in ( + select + "P"."P_PARTKEY" + from + "PART" "P" + where + "P"."P_NAME" like 'antique%' + ) + and "PS"."PS_AVAILQTY" > ( + select + 0.5 * sum("L"."L_QUANTITY") + from + "LINEITEM" "L" + where + "L"."L_PARTKEY" = "PS"."PS_PARTKEY" + and "L"."L_SUPPKEY" = "PS"."PS_SUPPKEY" + and "L"."L_SHIPDATE" >= date '1993-01-01' + and "L"."L_SHIPDATE" < date '1993-01-01' + interval '1 year' + ) + ) + and "S"."S_NATIONKEY" = "N"."N_NATIONKEY" + and "N"."N_NAME" = 'KENYA' +order by + "S"."S_NAME" diff --git 
a/isthmus/src/test/resources/tpch/postgresql/21.sql b/isthmus/src/test/resources/tpch/postgresql/21.sql new file mode 100644 index 000000000..ad87a2f02 --- /dev/null +++ b/isthmus/src/test/resources/tpch/postgresql/21.sql @@ -0,0 +1,40 @@ +select + "S"."S_NAME", + count(*) as "NUMWAIT" +from + "SUPPLIER" "S", + "LINEITEM" "L1", + "ORDERS" "O", + "NATION" "N" +where + "S"."S_SUPPKEY" = "L1"."L_SUPPKEY" + and "O"."O_ORDERKEY" = "L1"."L_ORDERKEY" + and "O"."O_ORDERSTATUS" = 'F' + and "L1"."L_RECEIPTDATE" > "L1"."L_COMMITDATE" + and exists ( + select + * + from + "LINEITEM" "L2" + where + "L2"."L_ORDERKEY" = "L1"."L_ORDERKEY" + and "L2"."L_SUPPKEY" <> "L1"."L_SUPPKEY" + ) + and not exists ( + select + * + from + "LINEITEM" "L3" + where + "L3"."L_ORDERKEY" = "L1"."L_ORDERKEY" + and "L3"."L_SUPPKEY" <> "L1"."L_SUPPKEY" + and "L3"."L_RECEIPTDATE" > "L3"."L_COMMITDATE" + ) + and "S"."S_NATIONKEY" = "N"."N_NATIONKEY" + and "N"."N_NAME" = 'BRAZIL' +group by + "S"."S_NAME" +order by + "NUMWAIT" desc, + "S"."S_NAME" +limit 100 diff --git a/isthmus/src/test/resources/tpch/postgresql/22.sql b/isthmus/src/test/resources/tpch/postgresql/22.sql new file mode 100644 index 000000000..6575869ec --- /dev/null +++ b/isthmus/src/test/resources/tpch/postgresql/22.sql @@ -0,0 +1,37 @@ +select + "CNTRYCODE", + count(*) as "NUMCUST", + sum("C_ACCTBAL") as "TOTACCTBAL" +from + ( + select + substring("C_PHONE" from 1 for 2) as "CNTRYCODE", + "C_ACCTBAL" + from + "CUSTOMER" "C" + where + substring("C_PHONE" from 1 for 2) in + ('24', '31', '11', '16', '21', '20', '34') + and "C_ACCTBAL" > ( + select + avg("C_ACCTBAL") + from + "CUSTOMER" + where + "C_ACCTBAL" > 0.00 + and substring("C_PHONE" from 1 for 2) in + ('24', '31', '11', '16', '21', '20', '34') + ) + and not exists ( + select + * + from + "ORDERS" "O" + where + "O"."O_CUSTKEY" = "C"."C_CUSTKEY" + ) + ) as "CUSTSALE" +group by + "CNTRYCODE" +order by + "CNTRYCODE" diff --git a/isthmus/src/test/resources/tpch/postgresql/tpch_init.sql 
b/isthmus/src/test/resources/tpch/postgresql/tpch_init.sql new file mode 100644 index 000000000..19cba2418 --- /dev/null +++ b/isthmus/src/test/resources/tpch/postgresql/tpch_init.sql @@ -0,0 +1,100 @@ +CREATE TABLE "PART" ( + "P_PARTKEY" BIGINT NOT NULL, + "P_NAME" VARCHAR(55), + "P_MFGR" CHAR(25), + "P_BRAND" CHAR(10), + "P_TYPE" VARCHAR(25), + "P_SIZE" INTEGER, + "P_CONTAINER" CHAR(10), + "P_RETAILPRICE" DECIMAL, + "P_COMMENT" VARCHAR(23) +); + +COPY "PART" FROM '/tmp/tpc-h/part.csv' WITH CSV HEADER; + +CREATE TABLE "SUPPLIER" ( + "S_SUPPKEY" BIGINT NOT NULL, + "S_NAME" CHAR(25), + "S_ADDRESS" VARCHAR(40), + "S_NATIONKEY" BIGINT NOT NULL, + "S_PHONE" CHAR(15), + "S_ACCTBAL" DECIMAL, + "S_COMMENT" VARCHAR(101) +); + +COPY "SUPPLIER" FROM '/tmp/tpc-h/supplier.csv' WITH CSV HEADER; + +CREATE TABLE "PARTSUPP" ( + "PS_PARTKEY" BIGINT NOT NULL, + "PS_SUPPKEY" BIGINT NOT NULL, + "PS_AVAILQTY" INTEGER, + "PS_SUPPLYCOST" DECIMAL, + "PS_COMMENT" VARCHAR(199) +); + +COPY "PARTSUPP" FROM '/tmp/tpc-h/partsupp.csv' WITH CSV HEADER; + +CREATE TABLE "CUSTOMER" ( + "C_CUSTKEY" BIGINT NOT NULL, + "C_NAME" VARCHAR(25), + "C_ADDRESS" VARCHAR(40), + "C_NATIONKEY" BIGINT NOT NULL, + "C_PHONE" CHAR(15), + "C_ACCTBAL" DECIMAL, + "C_MKTSEGMENT" CHAR(10), + "C_COMMENT" VARCHAR(117) +); + +COPY "CUSTOMER" FROM '/tmp/tpc-h/customer.csv' WITH CSV HEADER; + +CREATE TABLE "ORDERS" ( + "O_ORDERKEY" BIGINT NOT NULL, + "O_CUSTKEY" BIGINT NOT NULL, + "O_ORDERSTATUS" CHAR(1), + "O_TOTALPRICE" DECIMAL, + "O_ORDERDATE" DATE, + "O_ORDERPRIORITY" CHAR(15), + "O_CLERK" CHAR(15), + "O_SHIPPRIORITY" INTEGER, + "O_COMMENT" VARCHAR(79) +); + +COPY "ORDERS" FROM '/tmp/tpc-h/orders.csv' WITH CSV HEADER; + +CREATE TABLE "LINEITEM" ( + "L_ORDERKEY" BIGINT NOT NULL, + "L_PARTKEY" BIGINT NOT NULL, + "L_SUPPKEY" BIGINT NOT NULL, + "L_LINENUMBER" INTEGER, + "L_QUANTITY" DECIMAL, + "L_EXTENDEDPRICE" DECIMAL, + "L_DISCOUNT" DECIMAL, + "L_TAX" DECIMAL, + "L_RETURNFLAG" CHAR(1), + "L_LINESTATUS" CHAR(1), + 
"L_SHIPDATE" DATE, + "L_COMMITDATE" DATE, + "L_RECEIPTDATE" DATE, + "L_SHIPINSTRUCT" CHAR(25), + "L_SHIPMODE" CHAR(10), + "L_COMMENT" VARCHAR(44) +); + +COPY "LINEITEM" FROM '/tmp/tpc-h/lineitem.csv' WITH CSV HEADER; + +CREATE TABLE "NATION" ( + "N_NATIONKEY" BIGINT NOT NULL, + "N_NAME" CHAR(25), + "N_REGIONKEY" BIGINT NOT NULL, + "N_COMMENT" VARCHAR(152) +); + +COPY "NATION" FROM '/tmp/tpc-h/nation.csv' WITH CSV HEADER; + +CREATE TABLE "REGION" ( + "R_REGIONKEY" BIGINT NOT NULL, + "R_NAME" CHAR(25), + "R_COMMENT" VARCHAR(152) +); + +COPY "REGION" FROM '/tmp/tpc-h/region.csv' WITH CSV HEADER; diff --git a/readme.md b/readme.md index 5c0298c35..bfe9ffd6b 100644 --- a/readme.md +++ b/readme.md @@ -17,6 +17,19 @@ To build the Isthmus executable that enables Substrait plans to be generated for ./gradlew nativeCompile ``` +### Integration Tests + +Integration tests in this module are tagged with `@Tag("integration")` and are **skipped by default** during normal test runs. This is because they: +- Require Docker to be running. +- May take longer to execute. +- Use external resources (PostgreSQL containers, test data generation). + +To run **only** the integration tests: + +```bash +./gradlew integrationTest +``` + ## Getting Started A good way to get started is to experiment with building Substrait plans for your own SQL. To do that, you can use the isthmus executable as described [here](https://github.com/substrait-io/substrait-java/blob/main/isthmus/README.md).