PySpark error -- No FileSystem for scheme: adl

    Question

  • I'm running PySpark on my local machine and trying to pull data from an Azure Data Lake Hive table, but I'm getting the error below. I have already copied the hive-site.xml from the Hive2 conf folder on the Azure node into my own spark-2.3 conf folder.

    The PySpark installation on the Azure node runs this code fine, but the PySpark on my local machine fails with the "No FileSystem for scheme: adl" error below. I haven't been able to find a solution for this... (A sketch of the connector configuration that typically has to be present on the local machine is included after the stack trace.)

    The code: 

    from pyspark import SparkConf, SparkContext
    from pyspark.sql import SparkSession

    # Point Spark at the remote Hive metastore on the Azure node.
    SparkContext.setSystemProperty("hive.metastore.uris", "thrift://<redacted>")

    conf = SparkConf().setAll([
        ('spark.driver.memory', '8g'),
        ('spark.sql.autoBroadcastJoinThreshold', -1),
        ('spark.sql.broadcastTimeout', 1200),
        ('spark.debug.maxToStringFields', 1000)])

    # Build a Hive-enabled session so the metastore tables are visible.
    sparkSession = (SparkSession
        .builder
        .appName('azure-connect')
        .config(conf=conf)
        .enableHiveSupport()
        .getOrCreate())

    simple_query = """select stars_mcom_product_snap.product_type_id from qa_raw_prod.stars_mcom_product_snap
    where stars_mcom_product_snap.product_type_id is not null and stars_mcom_product_snap.product_type_id != 'NaN' limit 20"""

    sparkSession.sql(simple_query).show()


    2018-06-22 13:41:06 WARN  NativeCodeLoader:62 - Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
    Setting default log level to "WARN".
    To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
    ivysettings.xml file not found in HIVE_HOME or HIVE_CONF_DIR,/Users/m057388/Documents/spark-2.3.1-bin-hadoop2.7/conf/ivysettings.xml will be used
    Traceback (most recent call last):
      File "azure_pyspark_test.py", line 44, in <module>
        sparkSession.sql(simple_query).show()
      File "/usr/local/lib/python3.6/site-packages/pyspark/sql/dataframe.py", line 350, in show
        print(self._jdf.showString(n, 20, vertical))
      File "/usr/local/lib/python3.6/site-packages/py4j/java_gateway.py", line 1257, in __call__
        answer, self.gateway_client, self.target_id, self.name)
      File "/usr/local/lib/python3.6/site-packages/pyspark/sql/utils.py", line 63, in deco
        return f(*a, **kw)
      File "/usr/local/lib/python3.6/site-packages/py4j/protocol.py", line 328, in get_return_value
        format(target_id, ".", name), value)
    py4j.protocol.Py4JJavaError: An error occurred while calling o85.showString.
    : org.apache.spark.sql.catalyst.errors.package$TreeNodeException: execute, tree:
    Exchange SinglePartition
    +- *(1) LocalLimit 20
       +- *(1) Filter (isnotnull(product_type_id#16) && null)
          +- HiveTableScan [product_type_id#16], HiveTableRelation `qa_raw_prod`.`stars_mcom_product_snap`, org.apache.hadoop.hive.ql.io.orc.OrcSerde, [tenant_id#0, product_id#1L, product_desc#2, start_date#3, end_date#4, first_live_date#5, master_member_flag#6, gift_wrap_id#7, giftwrappable_flag#8, live_flag#9, archive_flag#10, base_fee_exempt_flag#11, surcharge_fee#12, cadence_code#13, mdse_dept_nbr#14, mdse_dept_vendor_nbr#15, product_type_id#16, subclass_code#17, mdse_class_nbr#18, mdse_divn_nbr#19, project_id#20, tax_code#21, home_category_id#22, primary_image_id#23, context_id#24, final_approval_flag#25, ext_host_url#26, d2c_flag#27, new_display_flag#28, created_ts#29, updated_ts#30, publish_flag#31, atomic_valid_flag#32, daas_load_id#33, daas_crt_ts#34, daas_upd_ts#35, daas_corrltn_id#36]
    
    	at org.apache.spark.sql.catalyst.errors.package$.attachTree(package.scala:56)
    	at org.apache.spark.sql.execution.exchange.ShuffleExchangeExec.doExecute(ShuffleExchangeExec.scala:119)
    	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:131)
    	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:127)
    	at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:155)
    	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
    	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:152)
    	at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:127)
    	at org.apache.spark.sql.execution.InputAdapter.inputRDDs(WholeStageCodegenExec.scala:371)
    	at org.apache.spark.sql.execution.BaseLimitExec$class.inputRDDs(limit.scala:62)
    	at org.apache.spark.sql.execution.GlobalLimitExec.inputRDDs(limit.scala:107)
    	at org.apache.spark.sql.execution.ProjectExec.inputRDDs(basicPhysicalOperators.scala:41)
    	at org.apache.spark.sql.execution.BaseLimitExec$class.inputRDDs(limit.scala:62)
    	at org.apache.spark.sql.execution.LocalLimitExec.inputRDDs(limit.scala:97)
    	at org.apache.spark.sql.execution.WholeStageCodegenExec.doExecute(WholeStageCodegenExec.scala:605)
    	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:131)
    	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:127)
    	at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:155)
    	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
    	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:152)
    	at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:127)
    	at org.apache.spark.sql.execution.SparkPlan.getByteArrayRdd(SparkPlan.scala:247)
    	at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:337)
    	at org.apache.spark.sql.execution.CollectLimitExec.executeCollect(limit.scala:38)
    	at org.apache.spark.sql.Dataset.org$apache$spark$sql$Dataset$$collectFromPlan(Dataset.scala:3273)
    	at org.apache.spark.sql.Dataset$$anonfun$head$1.apply(Dataset.scala:2484)
    	at org.apache.spark.sql.Dataset$$anonfun$head$1.apply(Dataset.scala:2484)
    	at org.apache.spark.sql.Dataset$$anonfun$52.apply(Dataset.scala:3254)
    	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:77)
    	at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3253)
    	at org.apache.spark.sql.Dataset.head(Dataset.scala:2484)
    	at org.apache.spark.sql.Dataset.take(Dataset.scala:2698)
    	at org.apache.spark.sql.Dataset.showString(Dataset.scala:254)
    	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
    	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
    	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
    	at java.lang.reflect.Method.invoke(Method.java:498)
    	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
    	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
    	at py4j.Gateway.invoke(Gateway.java:282)
    	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
    	at py4j.commands.CallCommand.execute(CallCommand.java:79)
    	at py4j.GatewayConnection.run(GatewayConnection.java:238)
    	at java.lang.Thread.run(Thread.java:748)
    Caused by: java.io.IOException: No FileSystem for scheme: adl
    	at org.apache.hadoop.fs.FileSystem.getFileSystemClass(FileSystem.java:2660)
    	at org.apache.hadoop.fs.FileSystem.createFileSystem(FileSystem.java:2667)
    	at org.apache.hadoop.fs.FileSystem.access$200(FileSystem.java:94)
    	at org.apache.hadoop.fs.FileSystem$Cache.getInternal(FileSystem.java:2703)
    	at org.apache.hadoop.fs.FileSystem$Cache.get(FileSystem.java:2685)
    	at org.apache.hadoop.fs.FileSystem.get(FileSystem.java:373)
    	at org.apache.hadoop.fs.Path.getFileSystem(Path.java:295)
    	at org.apache.hadoop.hive.ql.io.orc.OrcInputFormat.generateSplitsInfo(OrcInputFormat.java:990)
    	at org.apache.hadoop.hive.ql.io.orc.OrcInputFormat.getSplits(OrcInputFormat.java:1048)
    	at org.apache.spark.rdd.HadoopRDD.getPartitions(HadoopRDD.scala:200)
    	at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:253)
    	at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:251)
    	at scala.Option.getOrElse(Option.scala:121)
    	at org.apache.spark.rdd.RDD.partitions(RDD.scala:251)
    	at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:35)
    	at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:253)
    	at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:251)
    	at scala.Option.getOrElse(Option.scala:121)
    	at org.apache.spark.rdd.RDD.partitions(RDD.scala:251)
    	at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:35)
    	at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:253)
    	at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:251)
    	at scala.Option.getOrElse(Option.scala:121)
    	at org.apache.spark.rdd.RDD.partitions(RDD.scala:251)
    	at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:35)
    	at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:253)
    	at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:251)
    	at scala.Option.getOrElse(Option.scala:121)
    	at org.apache.spark.rdd.RDD.partitions(RDD.scala:251)
    	at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:35)
    	at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:253)
    	at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:251)
    	at scala.Option.getOrElse(Option.scala:121)
    	at org.apache.spark.rdd.RDD.partitions(RDD.scala:251)
    	at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:35)
    	at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:253)
    	at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:251)
    	at scala.Option.getOrElse(Option.scala:121)
    	at org.apache.spark.rdd.RDD.partitions(RDD.scala:251)
    	at org.apache.spark.ShuffleDependency.<init>(Dependency.scala:91)
    	at org.apache.spark.sql.execution.exchange.ShuffleExchangeExec$.prepareShuffleDependency(ShuffleExchangeExec.scala:318)
    	at org.apache.spark.sql.execution.exchange.ShuffleExchangeExec.prepareShuffleDependency(ShuffleExchangeExec.scala:91)
    	at org.apache.spark.sql.execution.exchange.ShuffleExchangeExec$$anonfun$doExecute$1.apply(ShuffleExchangeExec.scala:128)
    	at org.apache.spark.sql.execution.exchange.ShuffleExchangeExec$$anonfun$doExecute$1.apply(ShuffleExchangeExec.scala:119)
    	at org.apache.spark.sql.catalyst.errors.package$.attachTree(package.scala:52)
    	... 43 more
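
    For context, the operative line is the final "Caused by: java.io.IOException: No FileSystem for scheme: adl": the Hadoop 2.7 client bundled with spark-2.3.1-bin-hadoop2.7 has no FileSystem implementation registered for the adl:// scheme, so the HiveTableScan cannot open the ORC files at the table's Azure Data Lake location. The sketch below shows the kind of configuration that generally has to be added to a standalone PySpark install before adl:// paths resolve; the connector package versions and all credential values are placeholders/assumptions, not taken from this post, and the same settings can instead be passed with --packages/--conf on spark-submit or placed in core-site.xml.

    # Sketch only: make the ADLS Gen1 (adl://) connector available to a local
    # PySpark session. Package versions and credentials below are placeholders.
    from pyspark import SparkConf
    from pyspark.sql import SparkSession

    conf = (SparkConf()
        # Fetch the connector JARs at startup (versions are an assumption; they
        # must be compatible with the local Hadoop client on the classpath).
        .set('spark.jars.packages',
             'org.apache.hadoop:hadoop-azure-datalake:2.8.0,'
             'com.microsoft.azure:azure-data-lake-store-sdk:2.2.9')
        # Register an implementation class for the adl:// scheme.
        .set('spark.hadoop.fs.adl.impl',
             'org.apache.hadoop.fs.adl.AdlFileSystem')
        .set('spark.hadoop.fs.AbstractFileSystem.adl.impl',
             'org.apache.hadoop.fs.adl.Adl')
        # Service-principal (OAuth2) credentials for the Data Lake account.
        .set('spark.hadoop.fs.adl.oauth2.access.token.provider.type',
             'ClientCredential')
        .set('spark.hadoop.fs.adl.oauth2.client.id', '<application-id>')
        .set('spark.hadoop.fs.adl.oauth2.credential', '<client-secret>')
        .set('spark.hadoop.fs.adl.oauth2.refresh.url',
             'https://login.microsoftonline.com/<tenant-id>/oauth2/token'))

    spark = (SparkSession.builder
        .appName('azure-connect')
        .config(conf=conf)
        .enableHiveSupport()
        .getOrCreate())

    Whether ClientCredential or some other token provider applies depends on how the Data Lake account is secured; the essential part is getting the connector JARs onto the local classpath, since FileSystem.getFileSystemClass (FileSystem.java:2660 in the trace) otherwise finds nothing registered for "adl". Checking the table location with a query such as spark.sql("describe formatted qa_raw_prod.stars_mcom_product_snap").show(truncate=False) should confirm that it resolves to an adl:// path, which the Azure node can read, presumably because its Hadoop stack already ships the connector, but a stock local Spark cannot.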
    
    


    • Edited by lucyktan Friday, June 22, 2018 8:49 PM
    Friday, June 22, 2018 8:48 PM

All replies