
(Edit: Solr 6.6.0 on Ubuntu)

I'm trying to use Solr's DataImportHandler (DIH) to index a MySQL database that includes BLOB columns containing RTF files. For this I'm using the FieldStreamDataSource, as suggested in these answers:

How do I index Rich Format Documents in Blobs

Unsupported type Exception on Importing Documents from Database

While all the other (non-BLOB) fields get indexed, I keep getting the following Java runtime error from the FieldStreamDataSource.getData() method, as it appears in the Solr log files:

Caused by: java.lang.RuntimeException: unsupported type : class java.lang.String
    at org.apache.solr.handler.dataimport.FieldStreamDataSource.getData(FieldStreamDataSource.java:77)
    at org.apache.solr.handler.dataimport.FieldStreamDataSource.getData(FieldStreamDataSource.java:47)
    at org.apache.solr.handler.dataimport.DebugLogger$2.getData(DebugLogger.java:187)

The trace refers to these lines of the FieldStreamDataSource class (see arrows '<==='):

  @Override
  public InputStream getData(String query) {
    Object o = wrapper.getVariableResolver().resolve(dataField);
    if (o == null) {
      throw new DataImportHandlerException(SEVERE, "No field available for name : " + dataField);
    } else if (o instanceof Blob) {                                     // <========= XXX
      Blob blob = (Blob) o;
      try {
        return blob.getBinaryStream();
      } catch (SQLException sqle) {
        LOG.info("Unable to get data from BLOB");
        return null;
      }
    } else if (o instanceof byte[]) {
      byte[] bytes = (byte[]) o;
      return new ByteArrayInputStream(bytes);
    } else {
      throw new RuntimeException("unsupported type : " + o.getClass()); // <========= XXX
    }
  }

This should mean that the getData() method is receiving a String rather than a Blob (or byte[]).

I know it has been suggested in many other threads that this error arises when the blob value is null in the database:

Unsupported type Exception on Importing Documents from Database

Solr DIH Throwing Error Unsupported Type Class Java.Lang.String

However, that is not the case here: I have only 5 entries in this test DB, none of which has a null value (see the MySQL check sketch after the debug output below). Also, the debug output from the Solr DIH interface shows that the BLOB value was indeed retrieved from the DB (see arrows '<==='):

"verbose-output": [
        "entity:reports1",
        [
          "document#1",
          [
            "query",
            "SELECT id, institute, exam_date, age, acc, pacs FROM reports",
            "time-taken",
            "0:0:0.8",
            null,
            "----------- row #1-------------",
            "id",
            "1",
            "institute",
            "RADIOLOGY",
            "age",
            "68",
            "acc",
            "165184654",
            "pacs",
            "233215",
            "exame_date",
            "2016-02-05T00:00:00Z",
            null,
            "---------------------------------------------",
            "entity:reports2",
            [
              "query",
              "SELECT report FROM reports WHERE id='1'",
              "time-taken",
              "0:0:0.6",
              null,
              "----------- row #1-------------",
              "report",                   //   <=========   COLUMN NAME RETURNED FROM THE SQL SELECT
              "e1xydGYxXGFkZWZ[...]",    //   <=========   VALUE RETURNED FROM THE SQL SELECT
              null,
              "---------------------------------------------",
              "entity:report",
              [
                "query",
                "report",
                "EXCEPTION",              // <==========   EXCEPTION THROWN
                "java.lang.RuntimeException: unsupported type : class java.lang.String\n\tat org.apache.solr.handler.dataimport.FieldStreamDataSource.getData(FieldStreamDataSource.java:77)\n\tat org.apache.solr.handler.dataimport.FieldStreamDataSource.getData(FieldStreamDataSource.java:47)\n\tat org.apache.solr.handler.dataimport.DebugLogger$2.getData(DebugLogger.java:187)\n\tat org.apache.solr.handler.dataimport.TikaEntityProcessor.nextRow(TikaEntityProcessor.java:128)\n\tat org.apache.solr.handler.dataimport.EntityProcessorWrapper.nextRow(EntityProcessorWrapper.java:267)\n\tat org.apache.solr.handler.dataimport.DocBuilder.buildDocument(DocBuilder.java:475)\n\tat org.apache.solr.handler.dataimport.DocBuilder.buildDocument(DocBuilder.java:516)\n\tat org.apache.solr.handler.dataimport.DocBuilder.buildDocument(DocBuilder.java:516)\n\tat org.apache.solr.handler.dataimport.DocBuilder.buildDocument(DocBuilder.java:414)\n\tat org.apache.solr.handler.dataimport.DocBuilder.doFullDump(DocBuilder.java:329)\n\tat org.apache.solr.handler.dataimport.DocBuilder.execute(DocBuilder.java:232)\n\tat org.apache.solr.handler.dataimport.DataImporter.doFullImport(DataImporter.java:415)\n\tat org.apache.solr.handler.dataimport.DataImporter.runCmd(DataImporter.java:474)\n\tat org.apache.solr.handler.dataimport.DataImportHandler.handleRequestBody(DataImportHandler.java:180)\n\tat org.apache.solr.handler.RequestHandlerBase.handleRequest(RequestHandlerBase.java:173)\n\tat org.apache.solr.core.SolrCore.execute(SolrCore.java:2477)\n\tat org.apache.solr.servlet.HttpSolrCall.execute(HttpSolrCall.java:723)\n\tat org.apache.solr.servlet.HttpSolrCall.call(HttpSolrCall.java:529)\n\tat org.apache.solr.servlet.SolrDispatchFilter.doFilter(SolrDispatchFilter.java:361)\n\tat org.apache.solr.servlet.SolrDispatchFilter.doFilter(SolrDispatchFilter.java:305)\n\tat org.eclipse.jetty.servlet.ServletHandler$CachedChain.doFilter(ServletHandler.java:1691)\n\tat org.eclipse.jetty.servlet.ServletHandler.doHandle(ServletHandler.java:582)\n\tat org.eclipse.jetty.server.handler.ScopedHandler.handle(ScopedHandler.java:143)\n\tat org.eclipse.jetty.security.SecurityHandler.handle(SecurityHandler.java:548)\n\tat org.eclipse.jetty.server.session.SessionHandler.doHandle(SessionHandler.java:226)\n\tat org.eclipse.jetty.server.handler.ContextHandler.doHandle(ContextHandler.java:1180)\n\tat org.eclipse.jetty.servlet.ServletHandler.doScope(ServletHandler.java:512)\n\tat org.eclipse.jetty.server.session.SessionHandler.doScope(SessionHandler.java:185)\n\tat org.eclipse.jetty.server.handler.ContextHandler.doScope(ContextHandler.java:1112)\n\tat org.eclipse.jetty.server.handler.ScopedHandler.handle(ScopedHandler.java:141)\n\tat org.eclipse.jetty.server.handler.ContextHandlerCollection.handle(ContextHandlerCollection.java:213)\n\tat org.eclipse.jetty.server.handler.HandlerCollection.handle(HandlerCollection.java:119)\n\tat org.eclipse.jetty.server.handler.HandlerWrapper.handle(HandlerWrapper.java:134)\n\tat org.eclipse.jetty.rewrite.handler.RewriteHandler.handle(RewriteHandler.java:335)\n\tat org.eclipse.jetty.server.handler.HandlerWrapper.handle(HandlerWrapper.java:134)\n\tat org.eclipse.jetty.server.Server.handle(Server.java:534)\n\tat org.eclipse.jetty.server.HttpChannel.handle(HttpChannel.java:320)\n\tat org.eclipse.jetty.server.HttpConnection.onFillable(HttpConnection.java:251)\n\tat org.eclipse.jetty.io.AbstractConnection$ReadCallback.succeeded(AbstractConnection.java:273)\n\tat org.eclipse.jetty.io.FillInterest.fillable(FillInterest.java:95)\n\tat 
org.eclipse.jetty.io.SelectChannelEndPoint$2.run(SelectChannelEndPoint.java:93)\n\tat org.eclipse.jetty.util.thread.strategy.ExecuteProduceConsume.executeProduceConsume(ExecuteProduceConsume.java:303)\n\tat org.eclipse.jetty.util.thread.strategy.ExecuteProduceConsume.produceConsume(ExecuteProduceConsume.java:148)\n\tat org.eclipse.jetty.util.thread.strategy.ExecuteProduceConsume.run(ExecuteProduceConsume.java:136)\n\tat org.eclipse.jetty.util.thread.QueuedThreadPool.runJob(QueuedThreadPool.java:671)\n\tat org.eclipse.jetty.util.thread.QueuedThreadPool$2.run(QueuedThreadPool.java:589)\n\tat java.lang.Thread.run(Thread.java:748)\n",
                    "time-taken",
                    "0:0:0.0"
                  ]
                ]
              ],
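For reference, these are the kinds of quick checks on the MySQL side that confirm this; a minimal sketch, assuming the BLOB column is called report in the reports table:

    -- Confirm the column type is a BLOB variant (e.g. BLOB, MEDIUMBLOB, LONGBLOB)
    SHOW COLUMNS FROM reports LIKE 'report';

    -- Confirm that no row has a NULL (or empty) value in the blob column
    SELECT id, report IS NULL AS report_is_null, LENGTH(report) AS report_bytes
    FROM reports;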

Here is my data-config.xml:

<dataConfig>

        <dataSource 
            name="db"
            type="JdbcDataSource" 
            driver="com.mysql.jdbc.Driver"
            url="jdbc:mysql://localhost:3306/RIS" 
            user="root" 
            password="********"/>

        <dataSource name="fieldStream" type="FieldStreamDataSource"/>

        <document>
            <entity
                name="reports1"
                query="SELECT id, institute, exam_date, age, acc, pacs FROM reports"
                dataSource="db"
            >
                <field column="id"           name="id"/>
                <field column="institute"    name="institute"/>
                <field column="exam_date"    name="exam_date"/>
                <field column="age"          name="age"/>
                <field column="acc"          name="acc"/>
                <field column="pacs"         name="pacs"/>


                <entity
                    name="reports2"
                    query="SELECT report FROM reports WHERE id='${reports1.id}'"
                    dataSource="db"
                >
                    <entity 
                        name="report" 
                        dataSource="fieldStream"
                        processor="TikaEntityProcessor"
                        url="report"
                        dataField="reports2.REPORT"
                        format="text"
                        onError="continue">
                        <field column="text" name="report"/>
                    </entity>
                </entity>

            </entity>

        </document>
</dataConfig>

So it appears that the value retrieved from the BLOB column of the DB is being passed on, or recognized, as a String rather than a Blob. Can anyone help me see if I'm doing something wrong? I have searched everywhere and can't find the solution.

Many thanks

drD

1 Answer


I have found the (rather trivial) issue, even though it took me days of trying.

Changing the dataField attribute to lowercase did the trick, contrary to the experience reported in this answer. I made the change below (see arrow '<===') in data-config.xml, and all entries got indexed.

<entity
    name="reports2"
    query="SELECT report FROM reports WHERE id='${reports1.id}'"
    dataSource="db">
       <entity 
           name="report" 
           dataSource="fieldStream"
           processor="TikaEntityProcessor"
           url="report"
           dataField="reports2.report"  //    <===== instead of reports2.REPORT
           format="text"
           onError="continue">

           <field column="text" name="report"/>
       </entity>
</entity>

However, this revealed a separate issue that might be worth pointing out (it may deserve its own answer).

Because the column returned from the database ("report") had the same name as the Solr schema field ("report"), indexing happened automatically right after the SQL query, storing the binary value from the BLOB and ignoring the text extracted by the TikaEntityProcessor in the nested entity. I fixed this by making sure that the MySQL column and the Solr field had different names for this value (see the sketch after the snippet below).

Below, the arrows ('<===') show where the indexing happened (earlier than planned).

<entity
    name="reports2"
    query="SELECT report FROM reports WHERE id='${reports1.id}'" // <========  DIH indexed this column (report) from the DB into the Solr field of same name (binary representation)
    dataSource="db">
       <entity 
           name="report" 
           dataSource="fieldStream"
           processor="TikaEntityProcessor"
           url="report"
           dataField="reports2.report"
           format="text"
           onError="continue">

           <field column="text" name="report"/>  //<======== instead of waiting for this column (text), the output from Tika (extracted text)
       </entity>
</entity>
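
For clarity, here is a minimal sketch of that rename. The alias report_blob is just a hypothetical name chosen for illustration (and assumes no Solr field or dynamicField of that name exists); the point is that the SQL column no longer matches the Solr field report, so only the text extracted by Tika is indexed into it:

<entity
    name="reports2"
    query="SELECT report AS report_blob FROM reports WHERE id='${reports1.id}'"
    dataSource="db">

       <!-- report_blob (hypothetical alias) does not match any Solr field,
            so the raw BLOB is no longer indexed straight from the SQL query -->
       <entity 
           name="report" 
           dataSource="fieldStream"
           processor="TikaEntityProcessor"
           url="report_blob"
           dataField="reports2.report_blob"
           format="text"
           onError="continue">

           <!-- the text extracted by Tika (column "text") goes into the Solr field "report" -->
           <field column="text" name="report"/>
       </entity>
</entity>

This way the binary value flows only through the FieldStreamDataSource/Tika chain, and the Solr report field ends up holding the extracted text.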
drD
  • "I fixed this by making sure that the two fields (MySQL and Solr) had different names for this value." Please explain, I'm having the same problem. – jim collins May 08 '19 at 20:15