I am using AWS Glue to transfer data from a table in the Glue Data Catalog to another table in an RDS instance. The following code snippet is used to connect to the Glue Catalog table.
import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
## @params: [JOB_NAME]
args = getResolvedOptions(sys.argv, ['JOB_NAME'])
sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init(args['JOB_NAME'], args)
datasource0 = glueContext.create_dynamic_frame.from_catalog(database = "dev", table_name = "tbl", transformation_ctx = "datasource0")
............
job.commit()
Please note that the Glue Catalog table has data, which I verified from Athena. However, I am repeatedly getting the error below.
File "script_2019-05-16-16-17-26.py", line 20, in <module>
datasource0 = glueContext.create_dynamic_frame.from_catalog(database = "dev", table_name = "tbl", transformation_ctx = "datasource0")
File "/mnt/yarn/usercache/root/appcache/application_1558022970835_0001/container_1558022970835_0001_01_000001/PyGlue.zip/awsglue/dynamicframe.py", line 570, in from_catalog
File "/mnt/yarn/usercache/root/appcache/application_1558022970835_0001/container_1558022970835_0001_01_000001/PyGlue.zip/awsglue/context.py", line 138, in create_dynamic_frame_from_catalog
File "/mnt/yarn/usercache/root/appcache/application_1558022970835_0001/container_1558022970835_0001_01_000001/PyGlue.zip/awsglue/data_source.py", line 36, in getFrame
File "/mnt/yarn/usercache/root/appcache/application_1558022970835_0001/container_1558022970835_0001_01_000001/py4j-0.10.4-src.zip/py4j/java_gateway.py", line 1133, in __call__
File "/mnt/yarn/usercache/root/appcache/application_1558022970835_0001/container_1558022970835_0001_01_000001/pyspark.zip/pyspark/sql/utils.py", line 63, in deco
File "/mnt/yarn/usercache/root/appcache/application_1558022970835_0001/container_1558022970835_0001_01_000001/py4j-0.10.4-src.zip/py4j/protocol.py", line 319, in get_return_value
py4j.protocol.Py4JJavaError: An error occurred while calling o63.getDynamicFrame.
: java.lang.IndexOutOfBoundsException
at java.nio.Buffer.checkIndex(Buffer.java:540)
at java.nio.HeapByteBuffer.get(HeapByteBuffer.java:139)
at org.apache.hadoop.hive.ql.io.orc.ReaderImpl.extractMetaInfoFromFooter(ReaderImpl.java:374)
at org.apache.hadoop.hive.ql.io.orc.ReaderImpl.<init>(ReaderImpl.java:316)
at org.apache.hadoop.hive.ql.io.orc.OrcFile.createReader(OrcFile.java:187)
at org.apache.spark.sql.hive.orc.OrcFileOperator$$anonfun$getFileReader$2.apply(OrcFileOperator.scala:68)
The IAM role of the Glue job has the S3FullAccess, GlueFullAccess, and CloudWatchLogFullAccess policies attached.