2

I am trying to get the HTML of a Flipkart search results page using the JavaFX framework. The code below works and gives me the HTML output, though with a small glitch. (I am using the generated HTML to scrape the Flipkart website.) The problem is that, for most of the products in the HTML page, the image tag has no src attribute pointing to the product's JPEG file. I observe that for any product search only the top 3 to 6 products have the src attribute set; for the rest, the image tag is empty.

The ones that have the image URL look like this:

<DIV class="_3BTv9X" style="height: 240px; width:200px;">
    <IMG class="_1Nyybr _30XEf0" alt="" src="https://rukminim1.flixcart.com/image/312/312/mobile/d/f/w/motorola-moto-e3-power-pa4c0009in-original-imaemj7xpcfhnu8r.jpeg?q=70"/>
</DIV>

and the ones without the image URL look like this:

<DIV class="_3BTv9X" style="height: 240px; width: 200px;">
    <IMG class="_1Nyybr" alt=""/>
</DIV>

Below is the Java program that generates the HTML. I would like help figuring out why the image tags are empty and how to get the HTML page with proper img tags.

/**
 * JavaFX application that loads a Flipkart search page in a {@code WebView},
 * serialises the resulting DOM to text, prints it to standard output and exits.
 *
 * <p>{@link #FlipkartScrape(String)} can be fed the printed markup to extract
 * product titles, texts and image URLs with jsoup.
 */
public class Main extends Application {

    /**
     * Configures the stage, registers a load-state listener and starts loading
     * the search page. When the load worker reports {@code SUCCEEDED} the DOM is
     * dumped to standard output and the JavaFX platform is shut down.
     *
     * @param stage primary stage supplied by the JavaFX runtime; it is configured
     *              but never shown
     * @throws Exception declared by {@code Application.start}
     */
    @Override
    public void start(Stage stage) throws Exception {
        stage.setTitle("HTML");
        stage.setWidth(500);
        stage.setHeight(500);

        final WebView browser = new WebView();
        final WebEngine webEngine = browser.getEngine();

        // Attach the listener before starting the load so that no state
        // transition can happen while nobody is listening.
        webEngine.getLoadWorker().stateProperty().addListener((observable, oldState, newState) -> {
            if (newState == Worker.State.SUCCEEDED) {
                // NOTE(review): the DOM is captured as soon as the load worker
                // succeeds. Attributes that page scripts fill in later (for
                // example lazily-set img src values) may presumably still be
                // missing at this point - confirm, and consider delaying the dump.
                try {
                    ByteArrayOutputStream b = new ByteArrayOutputStream();
                    printDocument(webEngine.getDocument(), b);
                    // printDocument writes UTF-8, so decode with the same charset
                    // instead of the platform default.
                    System.out.println(b.toString("UTF-8"));
                } catch (Exception e) {
                    // Report the cause instead of hiding it behind a fixed message.
                    System.err.println("Caught Exception: " + e);
                } finally {
                    Platform.exit();
                }
            }
        });

        webEngine.load("https://www.flipkart.com/search?q=Motorola&otracker=start&as-show=on&as=off");
    }

    /**
     * Serialises a DOM document as indented, UTF-8 encoded XML without an XML
     * declaration.
     *
     * @param doc the document to serialise
     * @param out the stream that receives the encoded bytes; it is flushed but not closed
     * @throws IOException          if encoding or flushing the output fails
     * @throws TransformerException if the transformation fails
     */
    public static void printDocument(Document doc, OutputStream out) throws IOException, TransformerException {
        TransformerFactory tf = TransformerFactory.newInstance();
        Transformer transformer = tf.newTransformer();
        transformer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "yes");
        transformer.setOutputProperty(OutputKeys.METHOD, "xml");
        transformer.setOutputProperty(OutputKeys.INDENT, "yes");
        transformer.setOutputProperty(OutputKeys.ENCODING, "UTF-8");
        transformer.setOutputProperty("{http://xml.apache.org/xslt}indent-amount", "4");

        OutputStreamWriter writer = new OutputStreamWriter(out, "UTF-8");
        transformer.transform(new DOMSource(doc), new StreamResult(writer));
        // The writer buffers encoded bytes; flush so the caller sees everything in 'out'.
        writer.flush();
    }

    /**
     * Launches the JavaFX application.
     *
     * @param args command-line arguments, passed through to {@code launch}
     */
    public static void main(String[] args) {
        launch(args);
    }

    /**
     * Parses the given markup with jsoup and, for every {@code a[title]} element,
     * prints its title, the text of a following sibling element and the
     * {@code src} of every {@code img[class]} found in the element that follows
     * the anchor's grandparent.
     *
     * @param html markup to parse
     */
    public void FlipkartScrape(String html) {
        org.jsoup.nodes.Document doc = Jsoup.parse(html);
        Elements anchors = doc.select("a[title]");
        for (Element next : anchors) {
            String title = next.attr("title");
            String href = next.attr("href");

            // The element of interest is the third sibling after the anchor.
            Element e = nthNextSibling(next, 3);
            if (e == null) {
                continue;
            }

            if (href.equalsIgnoreCase(e.attr("href"))) {
                System.out.println(title);
                System.out.println(e.text());
            } else {
                // Otherwise the text is taken from one sibling further along.
                e = e.nextElementSibling();
                if (e == null) {
                    continue;
                }
                System.out.println(title);
                System.out.println("TEXT" + e.text());
            }

            Element parent = next.parent();
            Element grandParent = (parent == null) ? null : parent.parent();
            if (grandParent == null) {
                continue;
            }

            Element imageHolder = grandParent.nextElementSibling();
            if (imageHolder != null) {
                for (Element img : imageHolder.select("img[class]")) {
                    System.out.println("IMAGEHREF" + img.attr("src"));
                }
            }
        }
    }

    /**
     * Follows {@code nextElementSibling()} the given number of times.
     *
     * @param start element to start from
     * @param steps number of siblings to advance
     * @return the element reached, or {@code null} if the sibling chain ends first
     */
    private static Element nthNextSibling(Element start, int steps) {
        Element current = start;
        for (int i = 0; i < steps && current != null; i++) {
            current = current.nextElementSibling();
        }
        return current;
    }
}
  • Forgot to mention: I have already tried PhantomJS, but it does not seem to work for flipkart.com, hence I need to somehow make the above Java code work. –  Oct 21 '16 at 16:04

0 Answers0