My project parses the DBLP XML file (about 1 GB) and saves its contents into a database through an ORM. I use a SAX parser to read the information (paper authors, paper information, ...) and Hibernate to save the data into the database:
Step 1: Use the SAX parser to read an element like this:
<article key="journals/cs/BhaskarAS11" mdate="2011-11-09"><author>Data Ram Bhaskar</author><author>Kasim Karam Abdalla</author><author>Raj Senani</author><title>Electronically-Controlled Current-mode second order Sinusoidal Oscillators Using MO-OTAs and Grounded Capacitors.</title><pages>65-73</pages><year>2011</year><volume>2</volume><journal>Circuits and Systems</journal><number>2</number><ee>http://dx.doi.org/10.4236/cs.2011.22011</ee><url>db/journals/cs/cs2.html#BhaskarAS11</url></article>
to get the author names, publication, publisher, ... of the publication.
Step 2: Check for duplicate authors, publishers, ... in the database. Step 3: Save the publication in the database. End: go to the next element until the end of the DBLP data.
When I run it on a DBLP file of about 10 MB it runs fine (11 minutes) and inserts about 21,000 publications into the database. But when I run it on the full DBLP file (1 GB), after about 1 day, when I saw about 75,000 publications in the database, NetBeans stops responding. I can't get any error or message from NetBeans, and it can't insert any more publications into the database (but DBLP has about 1.7 million publications).
Sorry for the long question, but I'd very much appreciate any help any of you can give me! Is the problem in the SAX parser, or in Hibernate ...?
Here is some code:
Hibernate config:
<hibernate-configuration>
    <session-factory>
        <!-- MySQL connection settings -->
        <property name="hibernate.dialect">org.hibernate.dialect.MySQLDialect</property>
        <property name="hibernate.connection.driver_class">com.mysql.jdbc.Driver</property>
        <property name="hibernate.connection.url">jdbc:mysql://localhost:3306/cspublicationcrawler</property>
        <property name="hibernate.connection.username">root</property>
        <property name="hibernate.connection.password">root</property>

        <!-- Session / transaction behaviour -->
        <property name="hibernate.transaction.auto_close_session">false</property>
        <property name="hibernate.transaction.flush_before_completion">true</property>
        <property name="hibernate.connection.pool_size">50</property>
        <!-- Upper bound on the number of statements Hibernate groups into one JDBC batch -->
        <property name="hibernate.jdbc.batch_size">500</property>
        <!-- getCurrentSession() returns a session bound to the calling thread -->
        <property name="hibernate.current_session_context_class">org.hibernate.context.ThreadLocalSessionContext</property>

        <!-- Entity mappings -->
        <mapping resource="uit/tkorg/cspublicationtool/entities/Magazine.hbm.xml"/>
        <mapping resource="uit/tkorg/cspublicationtool/entities/Reviewer.hbm.xml"/>
        <mapping resource="uit/tkorg/cspublicationtool/entities/Conference.hbm.xml"/>
        <mapping resource="uit/tkorg/cspublicationtool/entities/Author.hbm.xml"/>
        <mapping resource="uit/tkorg/cspublicationtool/entities/Paper.hbm.xml"/>
        <mapping resource="uit/tkorg/cspublicationtool/entities/RankAuthorKeyword.hbm.xml"/>
        <mapping resource="uit/tkorg/cspublicationtool/entities/Publisher.hbm.xml"/>
        <mapping resource="uit/tkorg/cspublicationtool/entities/Comment.hbm.xml"/>
        <mapping resource="uit/tkorg/cspublicationtool/entities/Domain.hbm.xml"/>
        <mapping resource="uit/tkorg/cspublicationtool/entities/Journal.hbm.xml"/>
        <mapping resource="uit/tkorg/cspublicationtool/entities/PaperType.hbm.xml"/>
    </session-factory>
</hibernate-configuration>
Hibernate Util
public HibernateUtil() throws Exception {
sessionFactoryConfigPath = "";
sessionFactory = new Configuration().configure().buildSessionFactory();
}
public HibernateUtil(String sessionFactoryConfigPath) {
this.sessionFactoryConfigPath = sessionFactoryConfigPath;
sessionFactory = new Configuration().configure(sessionFactoryConfigPath).buildSessionFactory();
}
/**
 * Fetches the session factory's current session, remembers it in
 * {@code session}, and starts a transaction on it.
 */
protected void beginTransaction() {
    this.session = this.sessionFactory.getCurrentSession();
    this.session.beginTransaction();
}
/**
 * Flushes pending changes, commits the current transaction and closes the
 * session. Does nothing when no session has been obtained yet.
 *
 * <p>The session is closed even when the flush or commit fails, so a failed
 * commit no longer leaves an open session behind. The failure itself still
 * propagates to the caller.
 *
 * <p>Periodic {@code flush()}/{@code clear()} batching only helps when it is
 * driven by the number of entities actually saved; that counter belongs in
 * the code that calls {@code save}, not here. A single flush before the
 * commit is all this method needs.
 */
protected void commitAndClose() {
    if (session == null) {
        return;
    }
    try {
        session.flush();
        session.getTransaction().commit();
    } finally {
        // A thread-bound "current" session may already have been closed by the commit.
        if (session.isOpen()) {
            session.close();
        }
    }
}
SAXP
/**
 * Handles the end of an XML element while a record is being read.
 *
 * <p>Nothing is done for records whose tag equals {@code WWW} or
 * {@code PROCEEDINGS}. For every other record the text collected in
 * {@code str} is copied into {@code value} and then routed by element name:
 * <ul>
 *   <li>author / editor, journal, publisher and (for INPROCEEDINGS records)
 *       booktitle are looked up through their business object, created when
 *       the lookup returns {@code null}, and attached to the current paper;</li>
 *   <li>simple fields (title, pages, year, ...) are copied onto the paper;</li>
 *   <li>the closing tag of the record itself saves the paper and resets the
 *       per-record state.</li>
 * </ul>
 *
 * <p>Every exception raised in here, including one thrown by
 * {@code super.endElement}, is logged at SEVERE level and not propagated.
 *
 * @param uri       namespace URI reported by the parser
 * @param localName local name reported by the parser
 * @param qName     qualified name of the element that just ended
 * @throws SAXException declared by the handler contract
 */
public void endElement(String uri, String localName, String qName) throws SAXException {
    try {
        if (recordTag.equals(WWW) || recordTag.equals(PROCEEDINGS)) {
            return;
        }
        super.endElement(uri, localName, qName);
        if (this.str != null) {
            this.value = this.str.toString();
        }

        if (qName.equals(AUTHOR) || qName.equals(EDITOR)) {
            try {
                // Lookup uses the name without apostrophes; the stored name keeps them.
                // NOTE(review): a name containing an apostrophe may therefore never
                // match its stored form -- confirm against checkExitAuthor.
                String temp = value.replaceAll("'", "");
                author = this.authorBO.checkExitAuthor(temp);
                if (author == null) {
                    author = new Author();
                    author.setAuthorName(value);
                    authorBO.addNew(author);
                }
                authors.add(author);
                author = null;
                return;
            } catch (Exception ex) {
                Logger.getLogger(CSPublicationSAXEventHandler.class.getName()).log(Level.SEVERE, null, ex);
            }
        }
        if (qName.equals(TITLE)) {
            this.paper.setTitle(value);
            return;
        }
        if (qName.equals(BOOKTITLE)) {
            if (recordTag.equals(INPROCEEDINGS)) {
                String temp = value.replaceAll("'", "");
                conference = conferenceBO.checkExitConference(temp);
                if (conference == null) {
                    conference = new Conference();
                    conference.setConferenceName(value);
                    conferenceBO.addNew(conference);
                }
                // Link the conference whether it was found or newly created.
                this.paper.setConference(conference);
                conference = null;
                return;
            } else {
                // NOTE(review): for other record types the booktitle replaces the
                // paper title -- confirm this is intended.
                this.paper.setTitle(value);
                return;
            }
        }
        if (qName.equals(PAGES)) {
            this.paper.setPages(value);
            return;
        }
        if (qName.equals(YEAR)) {
            // A non-numeric year throws NumberFormatException, which the outer catch logs.
            this.paper.setYear(Integer.parseInt(value));
            return;
        }
        if (qName.equals(ADDRESS)) {
            this.paper.setAdress(value);
            return;
        }
        if (qName.equals(JOURNAL)) {
            try {
                String temp = value.replaceAll("'", "");
                journal = this.journalBO.checkExitJournal(temp);
                if (journal == null) {
                    journal = new Journal();
                    journal.setJournalName(value);
                    journalBO.addNew(journal);
                }
                this.paper.setJournal(journal);
                journal = null;
                return;
            } catch (Exception ex) {
                Logger.getLogger(CSPublicationSAXEventHandler.class.getName()).log(Level.SEVERE, null, ex);
            }
        }
        if (qName.equals(VOLUME)) {
            this.paper.setVolume(value);
            return;
        }
        if (qName.equals(NUMBER)) {
            this.paper.setNumber(value);
            return;
        }
        if (qName.equals(MONTH)) {
            this.paper.setMonth(value);
            return;
        }
        if (qName.equals(URL)) {
            this.paper.setUrl(value);
            return;
        }
        if (qName.equals(EE)) {
            this.paper.setEe(value);
            return;
        }
        if (qName.equals(CDROM)) {
            this.paper.setCdrom(value);
            return;
        }
        if (qName.equals(CITE)) {
            this.paper.setCite(value);
            return;
        }
        if (qName.equals(PUBLISHER)) {
            try {
                String temp = value.replaceAll("'", "");
                publisher = publisherBO.checkExitPublisher(temp);
                if (publisher == null) {
                    publisher = new Publisher();
                    publisher.setNamePublisher(value);
                    publisherBO.addNew(publisher);
                }
                this.paper.setPublisher(publisher);
                publisher = null;
                return;
            } catch (Exception ex) {
                Logger.getLogger(CSPublicationSAXEventHandler.class.getName()).log(Level.SEVERE, null, ex);
            }
        }
        if (qName.equals(CROSSREF)) {
            this.paper.setCrossref(value);
            return;
        }
        if (qName.equals(ISBN)) {
            this.paper.setIsbn(value);
            return;
        }
        if (qName.equals(SERIES)) {
            this.paper.setSeries(value);
            return;
        }
        if (qName.equals(SCHOOL)) {
            this.paper.setSchool(value);
            return;
        }
        if (qName.equals(CHAPTER)) {
            this.paper.setChapter(value);
            return;
        }
        if (qName.equals(recordTag)) {
            // End of the record: persist the paper, then drop the per-record state.
            this.paper.setAuthors(authors);
            this.paperBO.addNew(paper);
            this.authors = null;
            this.paper = null;
            this.str = null;
            return;
        }
    } catch (Exception ex) {
        Logger.getLogger(CSPublicationSAXEventHandler.class.getName()).log(Level.SEVERE, null, ex);
    }
}
/**
 * Called at the end of the document; only delegates to the superclass.
 *
 * @throws SAXException if the superclass implementation throws it
 */
@Override
public void endDocument() throws SAXException {
super.endDocument();
}
/**
 * Called at the start of the document: resets the text buffer and obtains
 * the business objects used while parsing.
 *
 * <p>If any business object cannot be obtained, the failure is logged and
 * rethrown as a {@link SAXException}. Continuing would leave the fields
 * {@code null} and fail on the first record with a much less informative
 * error.
 *
 * @throws SAXException if the superclass throws it, or if a business object
 *         cannot be obtained (the original exception is kept as the cause)
 */
@Override
public void startDocument() throws SAXException {
    super.startDocument();
    str = new StringBuffer();
    try {
        this.authorBO = AuthorBO.getAuthorBO();
        this.conferenceBO = ConferenceBO.getConferenceBO();
        this.journalBO = JournalBO.getJournalBO();
        this.publisherBO = PublisherBO.getPublisherBO();
        this.paperTypeBO = PaperTypeBO.getPaperTypeBO();
        this.paperBO = PaperBO.getPaperBO();
    } catch (Exception ex) {
        Logger.getLogger(CSPublicationSAXEventHandler.class.getName()).log(Level.SEVERE, null, ex);
        throw new SAXException("Could not obtain the business objects required for parsing", ex);
    }
}
/**
 * Called at the start of every element. Always resets the text buffer.
 *
 * <p>An element that carries a {@code key} attribute opens a new record: its
 * tag becomes {@code recordTag}, and a fresh paper and author set are
 * created. Unless the tag equals {@code WWW} or {@code PROCEEDINGS}, the
 * paper type named after the tag is looked up, created when the lookup
 * returns {@code null}, and set on the paper.
 *
 * @param uri        namespace URI reported by the parser
 * @param localName  local name reported by the parser
 * @param qName      qualified name of the element being opened
 * @param attributes attributes of the element
 * @throws SAXException if the superclass implementation throws it
 */
@Override
public void startElement(String uri, String localName, String qName, Attributes attributes) throws SAXException {
    super.startElement(uri, localName, qName, attributes);
    this.str = new StringBuffer();

    final boolean opensRecord = attributes.getLength() > 0 && attributes.getValue("key") != null;
    if (!opensRecord) {
        return;
    }

    recordTag = qName;
    this.paper = new Paper();
    this.authors = new HashSet<Author>();
    this.paper.setDblpKey(attributes.getValue("key"));

    final boolean ignoredRecord = recordTag.equals(WWW) || recordTag.equals(PROCEEDINGS);
    if (ignoredRecord) {
        return;
    }

    papertype = this.paperTypeBO.checkExitPaperType(qName);
    if (papertype == null) {
        try {
            papertype = new PaperType();
            papertype.setNameType(qName);
            paperTypeBO.addNew(papertype);
        } catch (Exception failure) {
            final Logger logger = Logger.getLogger(CSPublicationSAXEventHandler.class.getName());
            logger.log(Level.SEVERE, null, failure);
        }
    }
    this.paper.setPaperType(papertype);
    papertype = null;
}