spark 檔案建議使用spark-hadoop-cloud從https://spark.apache.org/docs/latest/cloud-integration.html 中的S3 讀取/寫入。
spark-hadoop-cloud 沒有 apache spark 發布的工件。然后在嘗試使用 Cloudera 發布的模塊時出現以下例外
Exception in thread "main" java.lang.NoSuchMethodError: 'void com.google.common.base.Preconditions.checkArgument(boolean, java.lang.String, java.lang.Object, java.lang.Object)'
at org.apache.hadoop.fs.s3a.S3AUtils.lookupPassword(S3AUtils.java:894)
at org.apache.hadoop.fs.s3a.S3AUtils.lookupPassword(S3AUtils.java:870)
at org.apache.hadoop.fs.s3a.S3AUtils.getEncryptionAlgorithm(S3AUtils.java:1605)
at org.apache.hadoop.fs.s3a.S3AFileSystem.initialize(S3AFileSystem.java:363)
at org.apache.hadoop.fs.FileSystem.createFileSystem(FileSystem.java:3303)
at org.apache.hadoop.fs.FileSystem.access$200(FileSystem.java:124)
at org.apache.hadoop.fs.FileSystem$Cache.getInternal(FileSystem.java:3352)
at org.apache.hadoop.fs.FileSystem$Cache.get(FileSystem.java:3320)
at org.apache.hadoop.fs.FileSystem.get(FileSystem.java:479)
at org.apache.hadoop.fs.Path.getFileSystem(Path.java:361)
at org.apache.spark.sql.execution.streaming.FileStreamSink$.hasMetadata(FileStreamSink.scala:46)
at org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:377)
at org.apache.spark.sql.DataFrameReader.loadV1Source(DataFrameReader.scala:325)
at org.apache.spark.sql.DataFrameReader.$anonfun$load$3(DataFrameReader.scala:307)
at scala.Option.getOrElse(Option.scala:189)
at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:307)
at org.apache.spark.sql.DataFrameReader.json(DataFrameReader.scala:519)
at org.apache.spark.sql.DataFrameReader.json(DataFrameReader.scala:428)
這似乎是類路徑沖突。然后似乎不可能使用 spark-hadoop-cloud 來讀取 vanilla apache spark 3.1.2 jars
version := "0.0.1"
scalaVersion := "2.12.12"
lazy val app = (project in file("app")).settings(
assemblyPackageScala / assembleArtifact := false,
assembly / assemblyJarName := "uber.jar",
assembly / mainClass := Some("com.example.Main"),
// more settings here ...
)
resolvers = "Cloudera" at "https://repository.cloudera.com/artifactory/cloudera-repos/"
libraryDependencies = "org.apache.spark" %% "spark-core" % "3.1.1" % "provided"
libraryDependencies = "org.apache.spark" %% "spark-sql" % "3.1.1" % "provided"
libraryDependencies = "org.apache.spark" %% "spark-hadoop-cloud" % "3.1.1.3.1.7270.0-253"
libraryDependencies = "org.apache.hadoop" % "hadoop-aws" % "3.1.1.7.2.7.0-184"
libraryDependencies = "org.apache.hadoop" % "hadoop-client" % "3.1.1.7.2.7.0-184"
libraryDependencies = "com.amazonaws" % "aws-java-sdk-bundle" % "1.11.901"
libraryDependencies = "com.github.mrpowers" %% "spark-daria" % "0.38.2"
libraryDependencies = "com.github.mrpowers" %% "spark-fast-tests" % "0.21.3" % "test"
libraryDependencies = "org.scalatest" %% "scalatest" % "3.0.1" % "test"
import org.apache.spark.sql.SparkSession
object SparkApp {
def main(args: Array[String]){
val spark = SparkSession.builder().master("local")
//.config("spark.jars.repositories", "https://repository.cloudera.com/artifactory/cloudera-repos/")
//.config("spark.jars.packages", "org.apache.spark:spark-hadoop-cloud_2.12:3.1.1.3.1.7270.0-253")
.appName("spark session").getOrCreate
val jsonDF = spark.read.json("s3a://path-to-bucket/compact.json")
val csvDF = spark.read.format("csv").load("s3a://path-to-bucket/some.csv")
jsonDF.show()
csvDF.show()
}
}
uj5u.com熱心網友回復:
要從 Spark 讀取和寫入 S3,您只需要以下 2 個依賴項:
"org.apache.hadoop" % "hadoop-aws" % hadoopVersion,
"org.apache.hadoop" % "hadoop-common" % hadoopVersion
確保您的作業節點使用的 haddopVersion 相同,并確保您的作業節點也有這些依賴項可用。您的其余代碼看起來是正確的。
轉載請註明出處,本文鏈接:https://www.uj5u.com/ruanti/312800.html
