0

Here my code helps me to extract the data from .doc files into paragraphs and specific string search as well.i can take it manual output using eclipse run configuration. but 1) i wanted it to direct output into excel file where it is .doc file.2) output must be specified cells only.

public static void readParagraphs(HWPFDocument docx) throws Exception{
    we = new WordExtractor(docx);
    String[] paragraphs = we.getParagraphText();     
    // To fetch for mode
    for(String p: paragraphs){
    if(p.startsWith("MODE"))
    System.out.println("       "+p);
    }
    for(String type: paragraphs ){
    if(type.startsWith("TYPE"))
    System.out.format("       "+type);
    }
    }

Expected output:

S.no  | Doc name  | Title    | mode            | type
=====================================================================
1     | laptop    | A12345   | abcd 123456     | efghij A12345/123456
2     | laptop    | A12346   | abcd 123457     | efghij A12345/123457
3     | laptop    | A12347   | abcd 123458     | efghij A12345/123458

here you can see piece of my code of HSSFWorkbook.

HSSFWorkbook workbook = new HSSFWorkbook();
HSSFSheet sheet = workbook.createSheet("firstsheet");
Row row1 = sheet.createRow((short) 0);
row1.createCell(0).setCellValue("S.NO");
row1.createCell(1).setCellValue("DOC NAME");
row1.createCell(2).setCellValue("TITLE");
row1.createCell(3).setCellValue("MODE");
row1.createCell(4).setCellValue("TYPE");
Row row2 = sheet.createRow(rowNum++);
row2.createCell(3).setCellValue(" "+mode);
row2.createCell(4).setCellValue(" "+type);
Row row3 = sheet.createRow(rowNum++);
row3.createCell(3).setCellValue(" "+mode);
row3.createCell(4).setCellValue(" "+type);

below table available on sheet 1, header file. need to extract only ' A12345'

 =====================================
 |  xx |       A12345         |xx    |     
 =====================================

below table available either sheet 2 or 3-6. depends on each doc.

 --------------------------------------------.--------------------
|MODE :  Abcde 123456 efghit 234567  sddsldjf 232132             |
|----------------------------------------------------------------|  
|INFO   |TYPE : efghij A12345/123456 dsflsdjflsd B22323/&123456  |
|       |xxxxxxxxxxxxxxxxxalphanumericxxxxxxxxxxxxxxxxxxxxxxxxxx |
 -----------------------------------------------------------------

if(p.startsWith("MODE"))// this method helps to print the " MODE : Abcde 123456 efghit 234567 sddsldjf 232132 " if(type.startsWith("TYPE")) // this method helps to print "TYPE : efghij A12345/123456 dsflsdjflsd B22323/&123456 " but some docs there is no 'TYPE' so,i would choose two options either to find the next lines until 'JUSTIFICATION' from 'MODE' or pattern recognition to fetch 'TYPE' lines. seek for suggestions.

below table available after above table

 -----------------------------------------------------------
|JUSTIFICATION                                              |   
|-----------------------------------------------------------
|   |   xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx    |
|   |   xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx alphanumericxxx    |
 -----------------------------------------------------------


import java.io.FileOutputStream;
import java.io.IOException;
import org.apache.poi.poifs.filesystem.*;
import org.apache.poi.sl.usermodel.Sheet;
import org.apache.poi.ss.usermodel.Cell;
import org.apache.poi.ss.usermodel.CellStyle;
import org.apache.poi.ss.usermodel.DataFormat;
import org.apache.poi.ss.usermodel.Row;
import org.apache.poi.ss.usermodel.Workbook;
import org.apache.poi.hssf.usermodel.HSSFCell;
import org.apache.poi.hssf.usermodel.HSSFRow;
import org.apache.poi.hssf.usermodel.HSSFSheet;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.hwpf.usermodel.HeaderStories;
import java.util.ArrayList;
import java.util.List;
import java.io.*;
public class ReadDocFileFromJava {

    public static int test = 0;

    private static WordExtractor ex;

        public static void main(String[] args) throws IOException  {

            List<String> fileName=new ArrayList<String>();  


        fileName.add("C:\\1200.doc");
        fileName.add("C:\\1210.doc");
        fileName.add("C:\\1211.doc");
        fileName.add("C:\\1212.doc");
        fileName.add("C:\\1213.doc");
                // document 2 
        fileName.add("C:\\1214.doc");
        fileName.add("C:\\1215.doc");
        fileName.add("C:\\1216.doc");
        fileName.add("C:\\1217.doc");


        for(int i=0;i<fileName.size();i++){

               readMyDocument(fileName.get(i));
        }}

    public static void readMyDocument(String i){
        POIFSFileSystem fs = null;
        try {
            fs = new POIFSFileSystem(new FileInputStream(i));

            HWPFDocument docx = new HWPFDocument(fs);
            ex = new WordExtractor(docx);
           readParagraphs(docx);
           fs.close();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
public static void readParagraphs(HWPFDocument docx) throws Exception{

        int a =0,b=0,c=0,d=0,celIte=0, celIte2=0,link=0;
        ex = new WordExtractor(docx);
        String[] paragraphs = ex.getParagraphText();

        HSSFWorkbook workbook = new HSSFWorkbook();
        HSSFSheet sheet = workbook.createSheet("firstsheet");
        Row row0 = sheet.createRow(0);
        row0.createCell(0).setCellValue("S.NO");
        row0.createCell(1).setCellValue("DOC NAME");
        row0.createCell(2).setCellValue("TITLE");
        row0.createCell(3).setCellValue("MODE");
        row0.createCell(4).setCellValue("TYPE");
          for(int i=1;i<=10;i++){
                        Row row1 = sheet.createRow(i);
                for(int j=0;j<=0;j++){
                    Cell cell_10 =row1.createCell(j);
                    do{
                        cell_10.setCellValue(celIte);
                        celIte++;
                    }while(celIte<1);

                    for(int k=3;k<=3;k++){

                    Cell cell_12 = row1.createCell(k);
                    for(String p: paragraphs){


                          if(p.startsWith("MODE"))  
                         cell_12.setCellValue(""+p);
        }   }
      workbook.write(new FileOutputStream("C:\\output.xls"));

        workbook.close();

}
                }
        }}

S.NO  | DOC NAME | TITLE | MODE | TYPE
==========================================
1                          XXXX
2                          XXXX 
3                          XXXX
4                          XXXX
5                          XXXX
6                          XXXX
7                          XXXX
8                          XXXX
9                          XXXX
shravan
  • 23
  • 9
  • you have to make clearer what you are trying to achieve. 1.)`i wanted it to direct output into excel file where it is .doc file` what do you mean with that? 2.) `output must be specified cells only` how are we supposed to know which output and which cells? – XtremeBaumer Apr 26 '17 at 13:04
  • 1) when we run the code, it must create a excel file using FileOutputStreams rather than console output.2) pls ignore this now. i m not clear much. – shravan Apr 26 '17 at 13:14
  • yes. i want it in excel. I knew csv not supported by Apache POI. – shravan Apr 26 '17 at 13:48
  • sry that comment was wrong question. if you want to create an excel file use something like this`try (OutputStream out = Files.newOutputStream(new Path(), StandardOpenOption.CREATE_NEW)) { this.workBook.write(out); this.workBook.close(); return path.toFile(); }` have a [look here](https://poi.apache.org/spreadsheet/quick-guide.html) – XtremeBaumer Apr 26 '17 at 13:58
  • if(type.startsWith("TYPE")) result of this string method generated by the HWPFDocument on the console ouput. how can i call the reference variable/creating object from this string method into outputstreams. i did look the link, its fine for XSSF&HSSF. – shravan Apr 27 '17 at 11:28
  • you create a workbook, then a sheet. then you create a row and in this row you create a cell and then you call `cell.setValue();` and in this method call you have to enter a parameter and this parameter should be `" "+type` – XtremeBaumer Apr 27 '17 at 11:35
  • yeah, its works fine. also could you tell that how to iterate the loop for string methods, cause it prints the same value into another cell. where its prints correctly on console ouput. – shravan Apr 27 '17 at 14:55
  • can you show how it **should** look like when its printed in the excel sheet? – XtremeBaumer Apr 28 '17 at 06:21
  • heading 1 heading 2 heading 3 mode type – shravan Apr 28 '17 at 08:34
  • please add it to your question and format it and all some examples to the columns/rows – XtremeBaumer Apr 28 '17 at 08:56
  • how do you get `S.no | Doc name | Title` ? values? – XtremeBaumer Apr 28 '17 at 10:46
  • if(p.startsWith("MODE")) method i will get title, mode,type and 'doc name' i m looking for suggestions to extract the names & 's.no' – shravan Apr 28 '17 at 10:53
  • then please provide an example of the file you read and how currently (in syso) the string looks like – XtremeBaumer Apr 28 '17 at 11:03
  • please don't post links to files or pictures. as long as you **can** post it here, do it – XtremeBaumer May 02 '17 at 06:13
  • i edited on question, please check. – shravan May 02 '17 at 12:07
  • what you can try if you can read the files for 100% correct is to store the values which you instead would print in an POJO with several arrayLists for ever possible heading (S.no, Doc name, Title, mode, type). after reading all files, you simply write the pojo into the excel file by iterating over each list and setting the cell values properly – XtremeBaumer May 03 '17 at 07:33
  • except doc name rest 3 fields readable and writable. but doc name can print on console. could you please check the code. – shravan May 05 '17 at 10:59
  • i think i know why. you create a new workbook for every document you read and probably delete the previous. do you get all the information `S.NO | DOC NAME | TITLE | MODE | TYPE` from the files or are some selfmade? – XtremeBaumer May 05 '17 at 11:15

1 Answers1

0

modified your code a bit, but its surely not working yet as i am missing information. maybe you can try to code the rest

public class ReadDocFileFromJava {

    public static int test = 0;

    private static WordExtractor ex;

    private static List<String[]> allParagraphs;

    public static void main(String[] args) throws IOException {

        List<String> fileName = new ArrayList<String>();

        fileName.add("C:\\1200.doc");
        fileName.add("C:\\1210.doc");
        fileName.add("C:\\1211.doc");
        fileName.add("C:\\1212.doc");
        fileName.add("C:\\1213.doc");
        // document 2
        fileName.add("C:\\1214.doc");
        fileName.add("C:\\1215.doc");
        fileName.add("C:\\1216.doc");
        fileName.add("C:\\1217.doc");

        for (int i = 0; i < fileName.size(); i++) {
            allParagraphs.add(readMyDocument(fileName.get(i)));
        }

    }

    public static String[] readMyDocument(String i) {
        POIFSFileSystem fs = null;
        String[] paragraph;
        try {
            fs = new POIFSFileSystem(new FileInputStream(i));
            HWPFDocument docx = new HWPFDocument(fs);
            ex = new WordExtractor(docx);
            paragraph = ex.getParagraphText();
            fs.close();
        } catch (Exception e) {
            e.printStackTrace();
        }
        return paragraph;

    }

    public static void readParagraphs(List<String[]> paragraphs) throws Exception {

        int a = 0, b = 0, c = 0, d = 0, celIte = 0, celIte2 = 0, link = 0;
        HSSFWorkbook workbook = new HSSFWorkbook();
        HSSFSheet sheet = workbook.createSheet("firstsheet");
        Row row = sheet.createRow(0);
        row.createCell(0).setCellValue("S.NO");
        row.createCell(1).setCellValue("DOC NAME");
        row.createCell(2).setCellValue("TITLE");
        row.createCell(3).setCellValue("MODE");
        row.createCell(4).setCellValue("TYPE");
        for (int i = 1; i <= 10; i++) {
            row = sheet.createRow(i);
            for (int j = 0; j <= 0; j++) {
                Cell cell_10 = row.createCell(j);
                do {
                    cell_10.setCellValue(celIte);
                    celIte++;
                } while (celIte < 1);

                for (int k = 3; k <= 3; k++) {

                    Cell cell_12 = row.createCell(k);
                    for (String p : paragraphs) {

                        if (p.startsWith("MODE"))
                            cell_12.setCellValue("" + p);
                    }
                }
                workbook.write(new FileOutputStream("C:\\output.xls"));

                workbook.close();

            }
        }
    }
}
XtremeBaumer
  • 6,275
  • 3
  • 19
  • 65