Scraping Data from a Website using JSoup (Lightly Seasoned with SQLite)

Here’s some Java code I wrote to scrape links from a site for insertion into a
SQLite database. I use JSoup to do the scraping. It uses JQuery selectors
to select content to scrape. I think I prefer XPath, but it’s good to
have another tool in the toolbox. I use a Google Chrome extension called
Scraper to help me form the JQuery selectors. In this example, I’m using
the SQLite FTS4 Fulltext Search extension to make the values easily
searchable in the DB.

package com.sample;

import java.io.IOException;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.SQLException;
import java.sql.Statement;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 *
 * @author Oscar Grouch
 */
public class SampleScraper {

    /**
     * @param args the command line arguments
     */
    public static void main(String[] args) 
    {
        /* Pull the all posts page */
        Document doc = null;
        try 
        {
            doc = Jsoup.connect("http://localhost/all.html").get();
        } 
        catch (IOException ex) 
        {
            Logger.getLogger(Scraper.class.getName()).log(Level.SEVERE, null, ex);
        }

        /* Get a list of all of the links on the all posts page */
        Elements links = doc.select("h2 > a");
        if (links != null)
        {
            /* Initialize DB driver. */
            try 
            {
                /* load the sqlite-JDBC driver using the current class loader */
                Class.forName("org.sqlite.JDBC");
            } 
            catch (ClassNotFoundException ex) 
            {
                System.err.println("Class not found: org.sqlite.JDBC: " + ex.getMessage());
            }

            Connection connection;
            try
            {
                /* create a database connection */
                connection = DriverManager.getConnection("jdbc:sqlite:C:/Temp/sample.db");
                Statement statement = connection.createStatement();
                statement.setQueryTimeout(30);  // set timeout to 30 sec.
                statement.executeUpdate("drop table if exists content");
                statement.executeUpdate("CREATE VIRTUAL TABLE content " + 
                                        "USING fts4(title, url, post)");
            }
            catch(SQLException e)
            {
                // if the error message is "out of memory", 
                // it probably means no database file is found
                System.err.println(e.getMessage());
                return;
            }


            /* Loop through the links */
            for (Element link : links)
            {
                /* Print out the link text */
                System.out.println("Processing " + link.text() + "...");

                Document currentDoc = null;
                try 
                {
                    /* Pull the current link page */
                    currentDoc = Jsoup.connect(link.attr("abs:href")).get();
                } 
                catch (IOException ex) 
                {
                    Logger.getLogger(Scraper.class.getName()).log(Level.SEVERE, null, ex);
                }

                /* Get the post Text */
                Element post = currentDoc.select("div.post").get(0);
                String postText = post.text();

                /* Print the post text */
                System.out.println("Post Text: " + postText);

                try
                {
                    PreparedStatement preparedStatement 
                      = connection.prepareStatement("insert into content values(?, ?, ? )");
                    preparedStatement.setString(1, link.text());
                    preparedStatement.setString(2, link.attr("abs:href"));
                    preparedStatement.setString(3, postText);
                    boolean isResultSet = preparedStatement.execute();
                    if (isResultSet)
                    {
                        System.out.println("A ResultSet was returned.");
                    }
                    else
                    {
                        System.out.println("No ResultSet was returned.");
                    }
                }
                catch(SQLException e)
                {
                    // if the error message is "out of memory", 
                    // it probably means no database file is found
                    System.err.println(e.getMessage());
                }

            }
        }
    }
}