0

I use serde to read data with specific format with delimiter |

One line of my data may looks like: key1=value2|key2=value2|key3="va , lues", and I create the hive table as below:

CREATE EXTERNAL TABLE(
field1 STRING,
field2 STRING,
field3 STRING
)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.RegexSerDe'
WITH SERDEPROPERTIES (
  "input.regex" = "([^\\|]*)\\|([^\\|]*)\\|([^\\|]*)",
  "output.format.string" = "%1$s %2$s %3$s"
)
STORED AS TEXTFILE;

I need to extract all values, ignore all quotas if they exist. Result looks like a

 value2  value2 va , lues

How can I change my current regexp for extractig values ?

dtolnay
  • 9,621
  • 5
  • 41
  • 62
rmnvnv
  • 1

1 Answers1

0

I can currently offer 2 options, none of them is perfect.
BTW, "output.format.string" is obsolete and has no effect.

1

create external table mytable
(
    q1          string    
   ,field1      string
   ,q2          string
   ,field2      string
   ,q3          string
   ,field3      string
)
row format serde 'org.apache.hadoop.hive.serde2.RegexSerDe'
with serdeproperties ('input.regex' = '.*?=(?<q1>"?)(.*?)(?:\\k<q1>)\\|.*?=(?<q2>"?)(.*?)(?:\\k<q2>)\\|.*?=(?<q3>"?)(.*?)(?:\\k<q3>)')
stored as textfile
;

select * from mytable
;

+----+--------+----+--------+----+-----------+
| q1 | field1 | q2 | field2 | q3 |  field3   |
+----+--------+----+--------+----+-----------+
|    | value2 |    | value2 | "  | va , lues |
+----+--------+----+--------+----+-----------+

2

create external table mytable
(
    field1 string
   ,field2 string
   ,field3 string
)
row format serde 'org.apache.hadoop.hive.serde2.RegexSerDe'
with serdeproperties ('input.regex' = '.*?=(".*?"|.*?)\\|.*?=(".*?"|.*?)\\|.*?=(".*?"|.*?)')
stored as textfile
;

select * from mytable
;

+--------+--------+-------------+
| field1 | field2 |   field3    |
+--------+--------+-------------+
| value2 | value2 | "va , lues" |
+--------+--------+-------------+
David דודו Markovitz
  • 42,900
  • 6
  • 64
  • 88