Discussion:
Fail to read parquet file with nested column
Prisdha Dharma
2018-11-06 21:55:38 UTC
Permalink
Hello,

The latest Apache Drill works fine with JDBC, JSON, CSV, and simple parquet files. However it fails to read parquet files with nested columns, such as the one with the following schema:

root
|-- pgid: binary (nullable = true)
|-- update: long (nullable = true)
|-- pid: integer (nullable = true)
|-- source: byte (nullable = true)
|-- creation: long (nullable = true)
|-- devices: array (nullable = true)
| |-- element: struct (containsNull = true)
| | |-- os: string (nullable = true)
| | |-- kind: byte (nullable = true)
| | |-- maker: string (nullable = true)
| | |-- model: string (nullable = true)
| | |-- id: string (nullable = true)
| | |-- sdk: string (nullable = true)
|-- optout: struct (nullable = true)
| |-- status: boolean (nullable = true)
| |-- level: byte (nullable = true)
|-- inferences: array (nullable = true)
| |-- element: struct (containsNull = true)
| | |-- attribute: integer (nullable = true)
| | |-- weight: byte (nullable = true)
| | |-- update: long (nullable = true)
| | |-- source: integer (nullable = true)
|-- languages: array (nullable = true)
| |-- element: string (containsNull = true)
|-- residence: struct (nullable = true)
| |-- position: struct (nullable = true)
| | |-- latitude: float (nullable = true)
| | |-- longitude: float (nullable = true)
| |-- timestamp: long (nullable = true)
| |-- address: string (nullable = true)
| |-- city: string (nullable = true)
| |-- state: string (nullable = true)
| |-- country: string (nullable = true)
| |-- zip: string (nullable = true)
| |-- ips: array (nullable = true)
| | |-- element: struct (containsNull = true)
| | | |-- address: string (nullable = true)
| | | |-- frequency: integer (nullable = true)
| | | |-- timestamp: long (nullable = true)
| |-- tags: array (nullable = true)
| | |-- element: integer (containsNull = true)
| |-- score: byte (nullable = true)
| |-- marker: byte (nullable = true)
| |-- visits: integer (nullable = true)
| |-- works: integer (nullable = true)
| |-- worktime: long (nullable = true)
| |-- offworks: integer (nullable = true)
| |-- pastime: long (nullable = true)
| |-- source: byte (nullable = true)
| |-- update: long (nullable = true)
|-- workplace: struct (nullable = true)
| |-- position: struct (nullable = true)
| | |-- latitude: float (nullable = true)
| | |-- longitude: float (nullable = true)
| |-- timestamp: long (nullable = true)
| |-- address: string (nullable = true)
| |-- city: string (nullable = true)
| |-- state: string (nullable = true)
| |-- country: string (nullable = true)
| |-- zip: string (nullable = true)
| |-- ips: array (nullable = true)
| | |-- element: struct (containsNull = true)
| | | |-- address: string (nullable = true)
| | | |-- frequency: integer (nullable = true)
| | | |-- timestamp: long (nullable = true)
| |-- tags: array (nullable = true)
| | |-- element: integer (containsNull = true)
| |-- score: byte (nullable = true)
| |-- marker: byte (nullable = true)
| |-- visits: integer (nullable = true)
| |-- works: integer (nullable = true)
| |-- worktime: long (nullable = true)
| |-- offworks: integer (nullable = true)
| |-- pastime: long (nullable = true)
| |-- source: byte (nullable = true)
| |-- update: long (nullable = true)
Drill fails with the following error:
0: jdbc:drill:zk=local> select * from profiles limit 10;
Error: INTERNAL_ERROR ERROR: Error in drill parquet reader (complex).
Message: Failure in setting up reader
Parquet Metadata: null
Fragment 0:0
Please, refer to logs for more information.
[Error Id: 83cf6c2d-29eb-4238-9093-f56086e492f9 on localhost:31010] (state=,code=0)


Could you please advise.


Prisdha Dharma
Kunal Khatua
2018-11-06 22:12:57 UTC
Permalink
Hi Prisdha

What do the logs say? Can you share the stack trace from the logs??

Kunal

On 11/6/2018 1:58:42 PM, Prisdha Dharma <***@intertrust.com> wrote:
Hello,

The latest Apache Drill works fine with JDBC, JSON, CSV, and simple parquet files. However it fails to read parquet files with nested columns, such as the one with the following schema:

root
|-- pgid: binary (nullable = true)
|-- update: long (nullable = true)
|-- pid: integer (nullable = true)
|-- source: byte (nullable = true)
|-- creation: long (nullable = true)
|-- devices: array (nullable = true)
| |-- element: struct (containsNull = true)
| | |-- os: string (nullable = true)
| | |-- kind: byte (nullable = true)
| | |-- maker: string (nullable = true)
| | |-- model: string (nullable = true)
| | |-- id: string (nullable = true)
| | |-- sdk: string (nullable = true)
|-- optout: struct (nullable = true)
| |-- status: boolean (nullable = true)
| |-- level: byte (nullable = true)
|-- inferences: array (nullable = true)
| |-- element: struct (containsNull = true)
| | |-- attribute: integer (nullable = true)
| | |-- weight: byte (nullable = true)
| | |-- update: long (nullable = true)
| | |-- source: integer (nullable = true)
|-- languages: array (nullable = true)
| |-- element: string (containsNull = true)
|-- residence: struct (nullable = true)
| |-- position: struct (nullable = true)
| | |-- latitude: float (nullable = true)
| | |-- longitude: float (nullable = true)
| |-- timestamp: long (nullable = true)
| |-- address: string (nullable = true)
| |-- city: string (nullable = true)
| |-- state: string (nullable = true)
| |-- country: string (nullable = true)
| |-- zip: string (nullable = true)
| |-- ips: array (nullable = true)
| | |-- element: struct (containsNull = true)
| | | |-- address: string (nullable = true)
| | | |-- frequency: integer (nullable = true)
| | | |-- timestamp: long (nullable = true)
| |-- tags: array (nullable = true)
| | |-- element: integer (containsNull = true)
| |-- score: byte (nullable = true)
| |-- marker: byte (nullable = true)
| |-- visits: integer (nullable = true)
| |-- works: integer (nullable = true)
| |-- worktime: long (nullable = true)
| |-- offworks: integer (nullable = true)
| |-- pastime: long (nullable = true)
| |-- source: byte (nullable = true)
| |-- update: long (nullable = true)
|-- workplace: struct (nullable = true)
| |-- position: struct (nullable = true)
| | |-- latitude: float (nullable = true)
| | |-- longitude: float (nullable = true)
| |-- timestamp: long (nullable = true)
| |-- address: string (nullable = true)
| |-- city: string (nullable = true)
| |-- state: string (nullable = true)
| |-- country: string (nullable = true)
| |-- zip: string (nullable = true)
| |-- ips: array (nullable = true)
| | |-- element: struct (containsNull = true)
| | | |-- address: string (nullable = true)
| | | |-- frequency: integer (nullable = true)
| | | |-- timestamp: long (nullable = true)
| |-- tags: array (nullable = true)
| | |-- element: integer (containsNull = true)
| |-- score: byte (nullable = true)
| |-- marker: byte (nullable = true)
| |-- visits: integer (nullable = true)
| |-- works: integer (nullable = true)
| |-- worktime: long (nullable = true)
| |-- offworks: integer (nullable = true)
| |-- pastime: long (nullable = true)
| |-- source: byte (nullable = true)
| |-- update: long (nullable = true)
Drill fails with the following error:
0: jdbc:drill:zk=local> select * from profiles limit 10;
Error: INTERNAL_ERROR ERROR: Error in drill parquet reader (complex).
Message: Failure in setting up reader
Parquet Metadata: null
Fragment 0:0
Please, refer to logs for more information.
[Error Id: 83cf6c2d-29eb-4238-9093-f56086e492f9 on localhost:31010] (state=,code=0)


Could you please advise.


Prisdha Dharma
Prisdha Dharma
2018-11-13 18:30:06 UTC
Permalink
Thank you for looking into this,



While I was trying to generate logs for this issue, it turned out that Drill can process the Parquet files that I used earlier just fine, and I can no longer reproduce this issue. I tried with both versions 1.14.0 and 1.15.0-SNAPSHOT and they work fine now.

Regards,

Prisdha Dharma
________________________________
From: Kunal Khatua <***@apache.org>
Sent: Tuesday, November 6, 2018 2:12 PM
To: ***@drill.apache.org
Cc: Oleg Mürk
Subject: Re: Fail to read parquet file with nested column

Hi Prisdha

What do the logs say? Can you share the stack trace from the logs??

Kunal

On 11/6/2018 1:58:42 PM, Prisdha Dharma <***@intertrust.com> wrote:
Hello,

The latest Apache Drill works fine with JDBC, JSON, CSV, and simple parquet files. However it fails to read parquet files with nested columns, such as the one with the following schema:

root
|-- pgid: binary (nullable = true)
|-- update: long (nullable = true)
|-- pid: integer (nullable = true)
|-- source: byte (nullable = true)
|-- creation: long (nullable = true)
|-- devices: array (nullable = true)
| |-- element: struct (containsNull = true)
| | |-- os: string (nullable = true)
| | |-- kind: byte (nullable = true)
| | |-- maker: string (nullable = true)
| | |-- model: string (nullable = true)
| | |-- id: string (nullable = true)
| | |-- sdk: string (nullable = true)
|-- optout: struct (nullable = true)
| |-- status: boolean (nullable = true)
| |-- level: byte (nullable = true)
|-- inferences: array (nullable = true)
| |-- element: struct (containsNull = true)
| | |-- attribute: integer (nullable = true)
| | |-- weight: byte (nullable = true)
| | |-- update: long (nullable = true)
| | |-- source: integer (nullable = true)
|-- languages: array (nullable = true)
| |-- element: string (containsNull = true)
|-- residence: struct (nullable = true)
| |-- position: struct (nullable = true)
| | |-- latitude: float (nullable = true)
| | |-- longitude: float (nullable = true)
| |-- timestamp: long (nullable = true)
| |-- address: string (nullable = true)
| |-- city: string (nullable = true)
| |-- state: string (nullable = true)
| |-- country: string (nullable = true)
| |-- zip: string (nullable = true)
| |-- ips: array (nullable = true)
| | |-- element: struct (containsNull = true)
| | | |-- address: string (nullable = true)
| | | |-- frequency: integer (nullable = true)
| | | |-- timestamp: long (nullable = true)
| |-- tags: array (nullable = true)
| | |-- element: integer (containsNull = true)
| |-- score: byte (nullable = true)
| |-- marker: byte (nullable = true)
| |-- visits: integer (nullable = true)
| |-- works: integer (nullable = true)
| |-- worktime: long (nullable = true)
| |-- offworks: integer (nullable = true)
| |-- pastime: long (nullable = true)
| |-- source: byte (nullable = true)
| |-- update: long (nullable = true)
|-- workplace: struct (nullable = true)
| |-- position: struct (nullable = true)
| | |-- latitude: float (nullable = true)
| | |-- longitude: float (nullable = true)
| |-- timestamp: long (nullable = true)
| |-- address: string (nullable = true)
| |-- city: string (nullable = true)
| |-- state: string (nullable = true)
| |-- country: string (nullable = true)
| |-- zip: string (nullable = true)
| |-- ips: array (nullable = true)
| | |-- element: struct (containsNull = true)
| | | |-- address: string (nullable = true)
| | | |-- frequency: integer (nullable = true)
| | | |-- timestamp: long (nullable = true)
| |-- tags: array (nullable = true)
| | |-- element: integer (containsNull = true)
| |-- score: byte (nullable = true)
| |-- marker: byte (nullable = true)
| |-- visits: integer (nullable = true)
| |-- works: integer (nullable = true)
| |-- worktime: long (nullable = true)
| |-- offworks: integer (nullable = true)
| |-- pastime: long (nullable = true)
| |-- source: byte (nullable = true)
| |-- update: long (nullable = true)
Drill fails with the following error:
0: jdbc:drill:zk=local> select * from profiles limit 10;
Error: INTERNAL_ERROR ERROR: Error in drill parquet reader (complex).
Message: Failure in setting up reader
Parquet Metadata: null
Fragment 0:0
Please, refer to logs for more information.
[Error Id: 83cf6c2d-29eb-4238-9093-f56086e492f9 on localhost:31010] (state=,code=0)


Could you please advise.


Prisdha Dharma

Loading...