I am trying to create a table via AWS Athena and getting the following error:
An error occurred (InvalidRequestException) when calling the StartQueryExecution operation: line 1:8: mismatched input 'EXTERNAL'. Expecting: 'OR', 'SCHEMA', 'TABLE', 'VIEW'
unable to rollback
Here is the statement it tries to execute which begets this error:
# SQL statement to execute
statement = """CREATE EXTERNAL TABLE IF NOT EXISTS {}.{}(
incident_id string,
incident_date date,
state string,
city_or_county string,
address string,
n_killed int,
n_injured int,
incident_url string,
source_url string,
incident_url_fields_missing int string,
congressional_district int,
gun_stolen string,
gun_type string,
incident_characteristics string,
latitude float,
location_description string,
longitude float,
n_guns_involved int,
notes string,
participant_age string,
participant_age_group string,
participant_gender string,
participant_name string,
participant_relationship string,
participant_status string,
participant_type string,
sources string,
state_house_district int,
state_senate_district int
) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' LINES TERMINATED BY '\\n' LOCATION '{}'
""".format(
database_name, table_name_csv, s3_private_path_csv
)
import pandas as pd
pd.read_sql(statement, conn)
If I run a print(statement) you get this:
CREATE EXTERNAL TABLE IF NOT EXISTS dsoaws.gun_violence_csv2(
incident_id varchar(10),
incident_date date,
state varchar(25),
city_or_county varchar(25),
address string,
n_killed int,
n_injured int,
incident_url string,
source_url string,
incident_url_fields_missing int string,
congressional_district int,
gun_stolen string,
gun_type string,
incident_characteristics string,
latitude float,
location_description string,
longitude float,
n_guns_involved int,
notes string,
participant_age string,
participant_age_group string,
participant_gender string,
participant_name string,
participant_relationship string,
participant_status string,
participant_type string,
sources string,
state_house_district int,
state_senate_district int
) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' LINES TERMINATED BY '\n' LOCATION 's3://sagemaker-studio-898900188658-v9o4cft4fj/gun-violence'
TBLPROPERTIES ('skip.header.line.count'='1')
I am pretty sure it has to do with the column types being declared. Per the documentation, it doesn't seem to me that the float type isn't allowed. (https://docs.aws.amazon.com/athena/latest/ug/data-types.html)
Per Athena documentation, the dates are in the right format YYYY-MM-DD.
Is there any problem with the fields being set to string? I used it as a general one just to try to get the table working.
I have also checked other stackoverflow articles and ensured that the database name and table name are not using any hyphens.
db name: dsoaws
table name: gun_violence_csv
As reference, the dataset is available here: https://github.com/jamesqo/gun-violence-data https://github.com/jamesqo/gun-violence-data/blob/master/DATA_01-2013_03-2018.tar.gz?raw=true
UPDATED here is the query result:
Failed to execute query.
Traceback (most recent call last):
File "/opt/conda/lib/python3.7/site-packages/pyathena/common.py", line 305, in _execute
**request
File "/opt/conda/lib/python3.7/site-packages/pyathena/util.py", line 84, in retry_api_call
return retry(func, *args, **kwargs)
File "/opt/conda/lib/python3.7/site-packages/tenacity/__init__.py", line 379, in __call__
do = self.iter(retry_state=retry_state)
File "/opt/conda/lib/python3.7/site-packages/tenacity/__init__.py", line 314, in iter
return fut.result()
File "/opt/conda/lib/python3.7/concurrent/futures/_base.py", line 428, in result
return self.__get_result()
File "/opt/conda/lib/python3.7/concurrent/futures/_base.py", line 384, in __get_result
raise self._exception
File "/opt/conda/lib/python3.7/site-packages/tenacity/__init__.py", line 382, in __call__
result = fn(*args, **kwargs)
File "/opt/conda/lib/python3.7/site-packages/botocore/client.py", line 357, in _api_call
return self._make_api_call(operation_name, kwargs)
File "/opt/conda/lib/python3.7/site-packages/botocore/client.py", line 676, in _make_api_call
raise error_class(parsed_response, operation_name)
botocore.errorfactory.InvalidRequestException: An error occurred (InvalidRequestException) when calling the StartQueryExecution operation: line 1:8: mismatched input 'EXTERNAL'. Expecting: 'OR', 'SCHEMA', 'TABLE', 'VIEW'
---------------------------------------------------------------------------
InvalidRequestException Traceback (most recent call last)
/opt/conda/lib/python3.7/site-packages/pyathena/common.py in _execute(self, operation, parameters, work_group, s3_staging_dir, cache_size, cache_expiration_time)
304 logger=_logger,
--> 305 **request
306 ).get("QueryExecutionId", None)
/opt/conda/lib/python3.7/site-packages/pyathena/util.py in retry_api_call(func, config, logger, *args, **kwargs)
83 )
---> 84 return retry(func, *args, **kwargs)
/opt/conda/lib/python3.7/site-packages/tenacity/__init__.py in __call__(self, fn, *args, **kwargs)
378 while True:
--> 379 do = self.iter(retry_state=retry_state)
380 if isinstance(do, DoAttempt):
/opt/conda/lib/python3.7/site-packages/tenacity/__init__.py in iter(self, retry_state)
313 if not (is_explicit_retry or self.retry(retry_state)):
--> 314 return fut.result()
315
/opt/conda/lib/python3.7/concurrent/futures/_base.py in result(self, timeout)
427 elif self._state == FINISHED:
--> 428 return self.__get_result()
429
/opt/conda/lib/python3.7/concurrent/futures/_base.py in __get_result(self)
383 if self._exception:
--> 384 raise self._exception
385 else:
/opt/conda/lib/python3.7/site-packages/tenacity/__init__.py in __call__(self, fn, *args, **kwargs)
381 try:
--> 382 result = fn(*args, **kwargs)
383 except BaseException: # noqa: B902
/opt/conda/lib/python3.7/site-packages/botocore/client.py in _api_call(self, *args, **kwargs)
356 # The "self" in this scope is referring to the BaseClient.
--> 357 return self._make_api_call(operation_name, kwargs)
358
/opt/conda/lib/python3.7/site-packages/botocore/client.py in _make_api_call(self, operation_name, api_params)
675 error_class = self.exceptions.from_code(error_code)
--> 676 raise error_class(parsed_response, operation_name)
677 else:
InvalidRequestException: An error occurred (InvalidRequestException) when calling the StartQueryExecution operation: line 1:8: mismatched input 'EXTERNAL'. Expecting: 'OR', 'SCHEMA', 'TABLE', 'VIEW'
The above exception was the direct cause of the following exception:
DatabaseError Traceback (most recent call last)
/opt/conda/lib/python3.7/site-packages/pandas/io/sql.py in execute(self, *args, **kwargs)
1696 try:
-> 1697 cur.execute(*args, **kwargs)
1698 return cur
/opt/conda/lib/python3.7/site-packages/pyathena/util.py in _wrapper(*args, **kwargs)
36 with _lock:
---> 37 return wrapped(*args, **kwargs)
38
/opt/conda/lib/python3.7/site-packages/pyathena/cursor.py in execute(self, operation, parameters, work_group, s3_staging_dir, cache_size, cache_expiration_time)
93 cache_size=cache_size,
---> 94 cache_expiration_time=cache_expiration_time,
95 )
/opt/conda/lib/python3.7/site-packages/pyathena/common.py in _execute(self, operation, parameters, work_group, s3_staging_dir, cache_size, cache_expiration_time)
308 _logger.exception("Failed to execute query.")
--> 309 raise DatabaseError(*e.args) from e
310 return query_id
DatabaseError: An error occurred (InvalidRequestException) when calling the StartQueryExecution operation: line 1:8: mismatched input 'EXTERNAL'. Expecting: 'OR', 'SCHEMA', 'TABLE', 'VIEW'
During handling of the above exception, another exception occurred:
NotSupportedError Traceback (most recent call last)
/opt/conda/lib/python3.7/site-packages/pandas/io/sql.py in execute(self, *args, **kwargs)
1700 try:
-> 1701 self.con.rollback()
1702 except Exception as inner_exc: # pragma: no cover
/opt/conda/lib/python3.7/site-packages/pyathena/connection.py in rollback(self)
241 def rollback(self) -> None:
--> 242 raise NotSupportedError
NotSupportedError:
The above exception was the direct cause of the following exception:
DatabaseError Traceback (most recent call last)
<ipython-input-35-098d263441f4> in <module>
1 import pandas as pd
2
----> 3 pd.read_sql(statement, conn)
/opt/conda/lib/python3.7/site-packages/pandas/io/sql.py in read_sql(sql, con, index_col, coerce_float, params, parse_dates, columns, chunksize)
488 coerce_float=coerce_float,
489 parse_dates=parse_dates,
--> 490 chunksize=chunksize,
491 )
492
/opt/conda/lib/python3.7/site-packages/pandas/io/sql.py in read_query(self, sql, index_col, coerce_float, params, parse_dates, chunksize)
1741
1742 args = _convert_params(sql, params)
-> 1743 cursor = self.execute(*args)
1744 columns = [col_desc[0] for col_desc in cursor.description]
1745
/opt/conda/lib/python3.7/site-packages/pandas/io/sql.py in execute(self, *args, **kwargs)
1704 f"Execution failed on sql: {args[0]}\n{exc}\nunable to rollback"
1705 )
-> 1706 raise ex from inner_exc
1707
1708 ex = DatabaseError(f"Execution failed on sql '{args[0]}': {exc}")
DatabaseError: Execution failed on sql: CREATE EXTERNAL TABLE IF NOT EXISTS dsoaws.gun_violence_csv2(
incident_id varchar(10),
incident_date date,
state varchar(25),
city_or_county varchar(25),
address string,
n_killed int,
n_injured int,
incident_url string,
source_url string,
incident_url_fields_missing int string,
congressional_district int,
gun_stolen string,
gun_type string,
incident_characteristics string,
latitude float,
location_description string,
longitude float,
n_guns_involved int,
notes string,
participant_age string,
participant_age_group string,
participant_gender string,
participant_name string,
participant_relationship string,
participant_status string,
participant_type string,
sources string,
state_house_district int,
state_senate_district int
) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' LINES TERMINATED BY '\n' LOCATION 's3://sagemaker-studio-898900188658-v9o4cft4fj/gun-violence'
TBLPROPERTIES ('skip.header.line.count'='1')
An error occurred (InvalidRequestException) when calling the StartQueryExecution operation: line 1:8: mismatched input 'EXTERNAL'. Expecting: 'OR', 'SCHEMA', 'TABLE', 'VIEW'
unable to rollback