By Haoran Wang, Senior Big Data Solution Architect at Alibaba Cloud
Add the following information to the project's pom.xml, then click Reload to re-import the Maven project.
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>AliDemo</groupId>
<artifactId>testSpark2</artifactId>
<version>1.0-SNAPSHOT</version>
<properties>
<maven.compiler.source>8</maven.compiler.source>
<maven.compiler.target>8</maven.compiler.target>
<spark.version>2.4.5</spark.version>
<cupid.sdk.version>3.3.8-public</cupid.sdk.version>
<scala.version>2.11.8</scala.version>
<scala.binary.version>2.11</scala.binary.version>
</properties>
<dependencies>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_${scala.binary.version}</artifactId>
<version>${spark.version}</version>
<scope>provided</scope>
<exclusions>
<exclusion>
<groupId>org.scala-lang</groupId>
<artifactId>scala-library</artifactId>
</exclusion>
<exclusion>
<groupId>org.scala-lang</groupId>
<artifactId>scalap</artifactId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql_${scala.binary.version}</artifactId>
<version>${spark.version}</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-mllib_${scala.binary.version}</artifactId>
<version>${spark.version}</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-streaming_${scala.binary.version}</artifactId>
<version>${spark.version}</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>com.aliyun.odps</groupId>
<artifactId>cupid-sdk</artifactId>
<version>${cupid.sdk.version}</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>com.aliyun.odps</groupId>
<artifactId>hadoop-fs-oss</artifactId>
<version>${cupid.sdk.version}</version>
</dependency>
<dependency>
<groupId>com.aliyun.odps</groupId>
<artifactId>odps-spark-datasource_${scala.binary.version}</artifactId>
<version>${cupid.sdk.version}</version>
</dependency>
<dependency>
<groupId>org.scala-lang</groupId>
<artifactId>scala-library</artifactId>
<version>${scala.version}</version>
</dependency>
<dependency>
<groupId>org.scala-lang</groupId>
<artifactId>scala-actors</artifactId>
<version>${scala.version}</version>
</dependency>
<!-- DataHub streaming dependency -->
<dependency>
<groupId>com.aliyun.emr</groupId>
<artifactId>emr-datahub_${scala.binary.version}</artifactId>
<version>1.6.0</version>
</dependency>
<dependency>
<groupId>com.aliyun.odps</groupId>
<artifactId>streaming-lib</artifactId>
<version>3.3.8-public</version>
<exclusions>
<exclusion>
<groupId>net.jpountz.lz4</groupId>
<artifactId>lz4</artifactId>
</exclusion>
<exclusion>
<groupId>org.scala-lang</groupId>
<artifactId>scala-library</artifactId>
</exclusion>
<exclusion>
<groupId>org.scala-lang</groupId>
<artifactId>scalap</artifactId>
</exclusion>
</exclusions>
</dependency>
</dependencies>
</project>
//package com.aliyun.odps.spark.examples.sparksql;
import com.aliyun.odps.Odps;
import com.aliyun.odps.cupid.CupidSession;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.types.*;
import java.util.ArrayList;
import java.util.List;
import org.apache.spark.sql.types.StructField;
public class SparkDemo2 {
public static void main(String[] args) {
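// Note: odps.exec.dynamic.partition.mode=nonstrict is set below so that the dynamic-partition insertInto/overwrite calls later in this demo are permitted.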
SparkSession spark = SparkSession
.builder()
.appName("SparkSQL-on-MaxCompute")
.config("spark.sql.broadcastTimeout", 20 * 60)
.config("spark.sql.crossJoin.enabled", true)
.config("odps.exec.dynamic.partition.mode", "nonstrict")
.getOrCreate();
JavaSparkContext sparkContext = new JavaSparkContext(spark.sparkContext());
String tableName = "mc_test_table";
String tableNameCopy = "mc_test_table_copy";
String ptTableName = "mc_test_pt_table";
spark.sql("DROP TABLE IF EXISTS " + tableName);
spark.sql("DROP TABLE IF EXISTS " + tableNameCopy);
spark.sql("DROP TABLE IF EXISTS " + ptTableName);
spark.sql("CREATE TABLE " + tableName + " (name STRING, num BIGINT)");
spark.sql("CREATE TABLE " + ptTableName + " (name STRING, num BIGINT) PARTITIONED BY (pt1 STRING, pt2 STRING)");
List<Integer> data = new ArrayList<Integer>();
for (int i = 0; i < 100; i++) {
data.add(i);
}
JavaRDD<Row> dfRDD = sparkContext.parallelize(data, 2).map(new Function<Integer, Row>() {
public Row call(Integer i) {
return RowFactory.create(
"name-" + i.toString(),
Long.valueOf(i));
}
});
JavaRDD<Row> ptDfRDD = sparkContext.parallelize(data, 2).map(new Function<Integer, Row>() {
public Row call(Integer i) {
return RowFactory.create(
"name-" + i.toString(),
Long.valueOf(i),
"2018",
"0601");
}
});
List<StructField> structFields = new ArrayList<StructField>();
structFields.add(DataTypes.createStructField("name", DataTypes.StringType, true));
structFields.add(DataTypes.createStructField("num", DataTypes.LongType, true));
Dataset<Row> df = spark.createDataFrame(dfRDD, DataTypes.createStructType(structFields));
structFields.add(DataTypes.createStructField("pt1", DataTypes.StringType, true));
structFields.add(DataTypes.createStructField("pt2", DataTypes.StringType, true));
Dataset<Row> ptDf = spark.createDataFrame(ptDfRDD, DataTypes.createStructType(structFields));
// write regular table
df.write().insertInto(tableName); // insertInto semantics
df.write().mode("overwrite").insertInto(tableName); // insertOverwrite semantics
// read regular table
Dataset<Row> rdf = spark.sql("select name, num from " + tableName);
System.out.println("rdf count: " + rdf.count());
rdf.printSchema();
//create table as select
spark.sql("CREATE TABLE " + tableNameCopy + " AS SELECT name, num FROM " + tableName);
spark.sql("SELECT * FROM " + tableNameCopy).show();
// write partitioned table
// The DataFrameWriter API cannot target a specific static partition directly; register a temporary view and write to the partition with SQL instead
df.createOrReplaceTempView(ptTableName + "_tmp_view");
spark.sql("insert into table " + ptTableName + " partition (pt1='2018', pt2='0601') select * from " + ptTableName + "_tmp_view");
spark.sql("insert overwrite table " + ptTableName + " partition (pt1='2018', pt2='0601') select * from " + ptTableName + "_tmp_view");
ptDf.write().insertInto(ptTableName);// dynamic partition insertInto semantics
ptDf.write().mode("overwrite").insertInto(ptTableName); // dynamic partition insertOverwrite semantics
// read partitioned table
Dataset<Row> rptdf = spark.sql("select name, num, pt1, pt2 from " + ptTableName + " where pt1 = '2018' and pt2 = '0601'");
System.out.println("rptdf count: " + rptdf.count());
rptdf.printSchema();
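// Example: inspect the MaxCompute table's partition metadata through the Cupid session's Odps client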
Odps odps = CupidSession.get().odps();
System.out.println(odps.tables().get(ptTableName).getPartitions().size());
System.out.println(odps.tables().get(ptTableName).getPartitions().get(0).getPartitionSpec());
}
}
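For reference, the partition read at the end of SparkDemo2 can also be expressed with the DataFrame API instead of a SQL string. The following is a minimal sketch rather than part of the original demo; the class name is illustrative, and it assumes the mc_test_pt_table created above and the same MaxCompute runtime configuration.
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

public class PartitionReadSketch {
    public static void main(String[] args) {
        SparkSession spark = SparkSession
                .builder()
                .appName("PartitionRead-Sketch")
                .getOrCreate();
        // Equivalent to: select name, num, pt1, pt2 from mc_test_pt_table where pt1 = '2018' and pt2 = '0601'
        Dataset<Row> rows = spark.table("mc_test_pt_table")
                .where("pt1 = '2018' AND pt2 = '0601'")
                .select("name", "num", "pt1", "pt2");
        System.out.println("partition row count: " + rows.count());
        rows.printSchema();
    }
}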
Component | Version |
Scala | 2.11.8 |
JDK | 1.8 |
Spark | 2.x |
import org.apache.spark.sql.{SaveMode, SparkSession}
object TestScala {
def main(args: Array[String]) {
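// odps.exec.dynamic.partition.mode=nonstrict (set below) permits the dynamic-partition insertInto/overwrite calls used further down.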
val spark = SparkSession
.builder()
.appName("SparkSQL-on-MaxCompute")
.config("spark.sql.broadcastTimeout", 20 * 60)
.config("spark.sql.crossJoin.enabled", true)
.config("odps.exec.dynamic.partition.mode", "nonstrict")
.getOrCreate()
// val project = spark.conf.get("odps.project.name")
import spark._
import sqlContext.implicits._
val tableName = "mc_test_table"
val ptTableName = "mc_test_pt_table"
// Drop Create
sql(s"DROP TABLE IF EXISTS ${tableName}")
sql(s"DROP TABLE IF EXISTS ${ptTableName}")
sql(s"CREATE TABLE ${tableName} (name STRING, num BIGINT)")
sql(s"CREATE TABLE ${ptTableName} (name STRING, num BIGINT) PARTITIONED BY (pt1 STRING, pt2 STRING)")
val df = spark.sparkContext.parallelize(0 to 99, 2).map(f => {
(s"name-$f", f)
}).toDF("name", "num")
val ptDf = spark.sparkContext.parallelize(0 to 99, 2).map(f => {
(s"name-$f", f, "2018", "0601")
}).toDF("name", "num", "pt1", "pt2")
// write regular table
df.write.insertInto(tableName) // insertInto semantics
df.write.mode("overwrite").insertInto(tableName) // insertOverwrite semantics
// write partitioned table
// The DataFrameWriter API cannot target a specific static partition directly; register a temporary view and write to the partition with SQL instead
df.createOrReplaceTempView(s"${ptTableName}_tmp_view")
sql(s"insert into table ${ptTableName} partition (pt1='2018', pt2='0601') select * from ${ptTableName}_tmp_view")
sql(s"insert overwrite table ${ptTableName} partition (pt1='2018', pt2='0601') select * from ${ptTableName}_tmp_view")
ptDf.write.insertInto(ptTableName) // dynamic partition insertInto semantics
ptDf.write.mode("overwrite").insertInto(ptTableName) // dynamic partition insertOverwrite semantics
// read regular table
val rdf = sql(s"select name, num from $tableName")
println(s"rdf count, ${rdf.count()}")
rdf.printSchema()
// read partitioned table
val rptdf = sql(s"select name, num, pt1, pt2 from $ptTableName where pt1 = '2018' and pt2 = '0601'")
println(s"rptdf count, ${rptdf.count()}")
rptdf.printSchema()
}
}
Component | Version |
JDK | 1.8 |
Scala | 2.12 |
Spark | 3.3.x or 3.0.x |
POM.xml
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>AliDemo</groupId>
<artifactId>testSpark3</artifactId>
<version>1.0-SNAPSHOT</version>
<properties>
<maven.compiler.source>8</maven.compiler.source>
<maven.compiler.target>8</maven.compiler.target>
<spark.version>3.1.1</spark.version>
<cupid.sdk.version>3.3.8-public</cupid.sdk.version>
<scala.version>2.12.12</scala.version>
<scala.binary.version>2.12</scala.binary.version>
</properties>
<dependencies>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_${scala.binary.version}</artifactId>
<version>${spark.version}</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql_${scala.binary.version}</artifactId>
<version>${spark.version}</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-mllib_${scala.binary.version}</artifactId>
<version>${spark.version}</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-streaming_${scala.binary.version}</artifactId>
<version>${spark.version}</version>
</dependency>
<!-- https://mvnrepository.com/artifact/com.aliyun.odps/odps-spark-datasource -->
<dependency>
<groupId>com.aliyun.odps</groupId>
<artifactId>cupid-sdk</artifactId>
<version>${cupid.sdk.version}</version>
</dependency>
<dependency>
<groupId>com.aliyun.odps</groupId>
<artifactId>hadoop-fs-oss</artifactId>
<version>${cupid.sdk.version}</version>
</dependency>
</dependencies>
</project>
SparkDemo3.java
import com.aliyun.odps.Odps;
import com.aliyun.odps.cupid.CupidSession;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.types.*;
import java.util.ArrayList;
import java.util.List;
import org.apache.spark.sql.types.StructField;
public class SparkDemo3 {
public static void main(String[] args) throws Exception {
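// The catalog and extension settings below route table resolution through the MaxCompute (ODPS) DataSource V2 catalog.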
SparkSession spark = SparkSession
.builder()
.appName("SparkSQL-on-MaxCompute")
.config("spark.sql.defaultCatalog","odps")
.config("spark.sql.catalog.odps", "org.apache.spark.sql.execution.datasources.v2.odps.OdpsTableCatalog")
.config("spark.sql.sources.partitionOverwriteMode", "dynamic")
.config("spark.sql.extensions", "org.apache.spark.sql.execution.datasources.v2.odps.extension.OdpsExtensions")
.config("spark.sql.catalogImplementation","hive")
.getOrCreate();
JavaSparkContext sparkContext = new JavaSparkContext(spark.sparkContext());
String tableName = "mc_test_table";
String tableNameCopy = "mc_test_table_copy";
String ptTableName = "mc_test_pt_table";
spark.sql("DROP TABLE IF EXISTS " + tableName);
spark.sql("DROP TABLE IF EXISTS " + tableNameCopy);
spark.sql("DROP TABLE IF EXISTS " + ptTableName);
spark.sql("CREATE TABLE " + tableName + " (name STRING, num BIGINT)");
spark.sql("CREATE TABLE " + ptTableName + " (name STRING, num BIGINT) PARTITIONED BY (pt1 STRING, pt2 STRING)");
List<Integer> data = new ArrayList<Integer>();
for (int i = 0; i < 100; i++) {
data.add(i);
}
JavaRDD<Row> dfRDD = sparkContext.parallelize(data, 2).map(new Function<Integer, Row>() {
public Row call(Integer i) {
return RowFactory.create(
"name-" + i.toString(),
Long.valueOf(i));
}
});
JavaRDD<Row> ptDfRDD = sparkContext.parallelize(data, 2).map(new Function<Integer, Row>() {
public Row call(Integer i) {
return RowFactory.create(
"name-" + i.toString(),
Long.valueOf(i),
"2018",
"0601");
}
});
List<StructField> structFields = new ArrayList<StructField>();
structFields.add(DataTypes.createStructField("name", DataTypes.StringType, true));
structFields.add(DataTypes.createStructField("num", DataTypes.LongType, true));
Dataset<Row> df = spark.createDataFrame(dfRDD, DataTypes.createStructType(structFields));
structFields.add(DataTypes.createStructField("pt1", DataTypes.StringType, true));
structFields.add(DataTypes.createStructField("pt2", DataTypes.StringType, true));
Dataset<Row> ptDf = spark.createDataFrame(ptDfRDD, DataTypes.createStructType(structFields));
// write regular table
df.write().insertInto(tableName); // insertInto semantics
df.writeTo(tableName).overwritePartitions(); // insertOverwrite semantics via the DataSource V2 writeTo API
// read regular table
Dataset<Row> rdf = spark.sql("select name, num from " + tableName);
System.out.println("rdf count: " + rdf.count());
rdf.printSchema();
//create table as select
spark.sql("CREATE TABLE " + tableNameCopy + " AS SELECT name, num FROM " + tableName);
spark.sql("SELECT * FROM " + tableNameCopy).show();
// write partitioned table
// The DataFrameWriter API cannot target a specific static partition directly; register a temporary view and write to the partition with SQL instead
df.createOrReplaceTempView(ptTableName + "_tmp_view");
spark.sql("insert into table " + ptTableName + " partition (pt1='2018', pt2='0601') select * from " + ptTableName + "_tmp_view");
spark.sql("insert overwrite table " + ptTableName + " partition (pt1='2018', pt2='0601') select * from " + ptTableName + "_tmp_view");
ptDf.write().insertInto(ptTableName);// dynamic partition insertInto semantics
ptDf.write().mode("overwrite").insertInto(ptTableName); // dynamic partition insertOverwrite semantics
// read partitioned table
Dataset<Row> rptdf = spark.sql("select name, num, pt1, pt2 from " + ptTableName + " where pt1 = '2018' and pt2 = '0601'");
System.out.println("rptdf count: " + rptdf.count());
rptdf.printSchema();
// Example: access MaxCompute (ODPS) metadata through the Cupid session
Odps odps = CupidSession.get().odps();
System.out.println(odps.tables().get(ptTableName).getPartitions().size());
System.out.println(odps.tables().get(ptTableName).getPartitions().get(0).getPartitionSpec());
}
}
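SparkDemo3 switches the overwrite path to the DataFrameWriterV2 API (writeTo). For comparison, here is a minimal sketch of the V2 write calls in isolation; it is not part of the original demo, the class name is illustrative, and it assumes the catalog settings from SparkDemo3 and the mc_test_table / mc_test_table_copy tables created there.
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

public class WriterV2Sketch {
    public static void main(String[] args) throws Exception {
        // Same ODPS catalog configuration as SparkDemo3 above.
        SparkSession spark = SparkSession
                .builder()
                .appName("WriterV2-Sketch")
                .config("spark.sql.defaultCatalog", "odps")
                .config("spark.sql.catalog.odps", "org.apache.spark.sql.execution.datasources.v2.odps.OdpsTableCatalog")
                .config("spark.sql.sources.partitionOverwriteMode", "dynamic")
                .config("spark.sql.extensions", "org.apache.spark.sql.execution.datasources.v2.odps.extension.OdpsExtensions")
                .config("spark.sql.catalogImplementation", "hive")
                .getOrCreate();
        Dataset<Row> df = spark.table("mc_test_table");
        // DataFrameWriterV2 append: adds rows, comparable to insertInto
        df.writeTo("mc_test_table_copy").append();
        // DataFrameWriterV2 overwritePartitions: dynamic-partition overwrite for
        // partitioned tables; SparkDemo3 uses it on a non-partitioned table as a
        // full insertOverwrite
        df.writeTo("mc_test_table_copy").overwritePartitions();
    }
}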
Component | Version |
Scala | 2.12.12 |
JDK | 1.8 |
Spark | 3.x |
import org.apache.spark.sql.{SaveMode, SparkSession}
object ScalaDemo {
def main(args: Array[String]) {
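// As in SparkDemo3, the catalog and extension settings below route table resolution through the MaxCompute (ODPS) DataSource V2 catalog.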
val spark = SparkSession
.builder()
.appName("SparkSQL-on-MaxCompute")
.config("spark.sql.broadcastTimeout", 20 * 60)
.config("spark.sql.crossJoin.enabled", true)
.config("spark.sql.defaultCatalog","odps")
.config("spark.sql.catalog.odps", "org.apache.spark.sql.execution.datasources.v2.odps.OdpsTableCatalog")
.config("spark.sql.sources.partitionOverwriteMode", "dynamic")
.config("spark.sql.extensions", "org.apache.spark.sql.execution.datasources.v2.odps.extension.OdpsExtensions")
.config("spark.sql.catalogImplementation","hive")
.getOrCreate()
import spark._
import sqlContext.implicits._
val tableName = "mc_test_table"
val ptTableName = "mc_test_pt_table"
// Drop Create
sql(s"DROP TABLE IF EXISTS ${tableName}")
sql(s"DROP TABLE IF EXISTS ${ptTableName}")
sql(s"CREATE TABLE ${tableName} (name STRING, num BIGINT)")
sql(s"CREATE TABLE ${ptTableName} (name STRING, num BIGINT) PARTITIONED BY (pt1 STRING, pt2 STRING)")
val df = spark.sparkContext.parallelize(0 to 99, 2).map(f => {
(s"name-$f", f)
}).toDF("name", "num")
val ptDf = spark.sparkContext.parallelize(0 to 99, 2).map(f => {
(s"name-$f", f, "2018", "0601")
}).toDF("name", "num", "pt1", "pt2")
// write regular table
df.write.insertInto(tableName) // insertInto semantics
df.writeTo(tableName).overwritePartitions() // insertOverwrite semantics via the DataSource V2 writeTo API
// write partitioned table
// The DataFrameWriter API cannot target a specific static partition directly; register a temporary view and write to the partition with SQL instead
df.createOrReplaceTempView(s"${ptTableName}_tmp_view")
sql(s"insert into table ${ptTableName} partition (pt1='2018', pt2='0601') select * from ${ptTableName}_tmp_view")
sql(s"insert overwrite table ${ptTableName} partition (pt1='2018', pt2='0601') select * from ${ptTableName}_tmp_view")
ptDf.write.insertInto(ptTableName) // dynamic partition insertInto semantics
ptDf.write.mode("overwrite").insertInto(ptTableName) // dynamic partition insertOverwrite semantics
// read regular table
val rdf = sql(s"select name, num from $tableName")
println(s"rdf show, ${rdf.count()}")
rdf.show()
rdf.printSchema()
// read partitioned table
val rptdf = sql(s"select name, num, pt1, pt2 from $ptTableName where pt1 = '2018' and pt2 = '0601'")
println(s"rptdf show, ${rptdf.count()}")
rptdf.show()
rptdf.printSchema()
}
}
Component | Version |
Spark | 2.3 (hardcoded) |
Python | Python 2 or Python 3 |
Only Spark 2.x is supported; either Python 2 or Python 3 can be used.
# -*- coding: utf-8 -*-
import sys
from pyspark.sql import SparkSession
try:
    # for Python 2
    reload(sys)
    sys.setdefaultencoding('utf8')
except:
    # not needed for Python 3
    pass
if __name__ == '__main__':
    spark = SparkSession.builder\
        .appName("spark sql")\
        .config("spark.sql.broadcastTimeout", 20 * 60)\
        .config("spark.sql.crossJoin.enabled", True)\
        .getOrCreate()
    tableName = "mc_test_table"
    ptTableName = "mc_test_pt_table"
    data = [i for i in range(0, 100)]
    # Drop Create
    spark.sql("DROP TABLE IF EXISTS %s" % tableName)
    spark.sql("DROP TABLE IF EXISTS %s" % ptTableName)
    spark.sql("CREATE TABLE %s (name STRING, num BIGINT)" % tableName)
    spark.sql("CREATE TABLE %s (name STRING, num BIGINT) PARTITIONED BY (pt1 STRING, pt2 STRING)" % ptTableName)
    df = spark.sparkContext.parallelize(data, 2).map(lambda s: ("name-%s" % s, s)).toDF("name: string, num: int")
    pt_df = spark.sparkContext.parallelize(data, 2).map(lambda s: ("name-%s" % s, s, "2018", "0601")).toDF("name: string, num: int, pt1: string, pt2: string")
    # write regular table
    df.write.insertInto(tableName)  # insertInto semantics
    df.write.insertInto(tableName, True)  # insertOverwrite semantics
    # write partitioned table
    # The DataFrameWriter API cannot target a specific static partition directly; register a temporary view and write to the partition with SQL instead
    df.createOrReplaceTempView("%s_tmp_view" % ptTableName)
    spark.sql("insert into table %s partition (pt1='2018', pt2='0601') select * from %s_tmp_view" % (ptTableName, ptTableName))
    spark.sql("insert overwrite table %s partition (pt1='2018', pt2='0601') select * from %s_tmp_view" % (ptTableName, ptTableName))
    pt_df.write.insertInto(ptTableName)  # dynamic partition insertInto semantics
    pt_df.write.insertInto(ptTableName, True)  # dynamic partition insertOverwrite semantics
    # read regular table
    rdf = spark.sql("select name, num from %s" % tableName)
    print("rdf count, %s\n" % rdf.count())
    rdf.printSchema()
    # read partitioned table
    rptdf = spark.sql("select name, num, pt1, pt2 from %s where pt1 = '2018' and pt2 = '0601'" % ptTableName)
    print("rptdf count, %s" % (rptdf.count()))
    rptdf.printSchema()