Chris Pollett > Students >
Long

    ( Print View )

    [Bio]

    [Project Blog]

    [CS297 Proposal]

    [Del 1]

    [Del 2]

    [Del 3]

    [CS297Report-PDF]

    [CS298 Proposal]

                          

























Deliverable 2: Program for Extracting Text from a PDF and Converting It into JPG Images

Description: This program reads a PDF file. It then goes through the contents of each page in the file and extracts the text elements. Each word is then drawn onto a BufferedImage, which is saved as a JPG image. The program was developed in Java with the iText Java library for PDF. Writing it was a very useful way to learn about the PDF file structure, the iText library, extracting objects from a PDF, and converting text into images.

Example: This is what my code outputs on these inputs.

This PDF word was extracted from a PDF file and saved as JPEG image

A PDF word in PDF file is converted into JPG image.

This jAvA word was extracted from a PDF file and saved as JPEG image

A jAvA word in PDF file is converted into JPG image.

/**
 * Project  : Extracting text from a PDF and converting it into JPG images (Deliverable #2)
 * File Name: Deliverable_2.java
 * Purpose  : This program reads a PDF file, extracts the text elements in the file,
 *             and then converts them into JPG images
 * Created  : Oct 9th 2006
 * Last Modified: Oct 31st 2006
 * Java Version   : 1.5.0_08
 */
package myclasses;
import java.awt.*;
import java.awt.font.FontRenderContext;
import java.awt.font.TextLayout;
import java.awt.geom.Point2D;
import java.awt.geom.Rectangle2D;
import java.awt.image.BufferedImage;
import java.io.File;
import com.lowagie.text.pdf.*;
import java.util.Iterator;
import javax.imageio.*;

/**
 * Extracts the text of a PDF file and saves every word as its own JPG image.
 *
 * <p>Each page's content stream is tokenized with iText's {@code PRTokeniser}.
 * Every string token is printed to standard output, split into
 * whitespace-separated words, and each word is rendered in black on a white
 * background and written to {@code "N image.jpg"} in the working directory,
 * where {@code N} counts the words in order of appearance, starting at 1.
 *
 * @author Nam Long
 */
public class Deliverable_2 {

   /** PDF that is read when no path is given on the command line. */
   private static final String DEFAULT_INPUT = "test.pdf";

   /** Font used both to measure and to draw every word. */
   private static final Font WORD_FONT = new Font("Arial", Font.PLAIN, 12);

   /** Extra pixels added to the measured text width of each image. */
   private static final int EXTRA_WIDTH = 5;

   /** Extra pixels added to the measured text height of each image. */
   private static final int EXTRA_HEIGHT = 3;

   /** Gap, in pixels, between the top-left image corner and the text bounds. */
   private static final int MARGIN = 1;

   /**
    * Reads a PDF file and writes one JPG image per extracted word.
    *
    * <p>Any failure is reported as a stack trace on standard error; the
    * method itself never throws.
    *
    * @param args optional; {@code args[0]} is the path of the PDF to read.
    *             When absent, {@code test.pdf} in the working directory is used.
    */
   public static void main(String[] args) {
      String inputPath = (args != null && args.length > 0) ? args[0] : DEFAULT_INPUT;
      PdfReader reader = null;

      try {
         reader = new PdfReader(inputPath);
         FontRenderContext frc = createFontRenderContext();
         int counter = 0;

         // Let iText resolve the page tree and the content streams: pages may
         // be nested several levels deep and /Contents may be an array of
         // streams, neither of which a manual walk over /Kids handles.
         int pageCount = reader.getNumberOfPages();
         for (int page = 1; page <= pageCount; page++) {
            byte[] content = reader.getPageContent(page);
            if (content == null) {
               continue;
            }
            counter = writeWordImages(content, frc, counter);
         }
      }
      catch (Exception e) {
         // Top-level boundary of a command-line tool: report the full trace.
         e.printStackTrace();
      }
      finally {
         if (reader != null) {
            reader.close();
         }
      }
   }

   /**
    * Tokenizes one page content stream and writes an image for every word
    * found in its string tokens.
    *
    * @param content decoded bytes of the page content stream
    * @param frc     render context used to measure the words
    * @param counter number of images written so far
    * @return the number of images written so far, including this page's
    * @throws java.io.IOException if tokenizing fails or an image cannot be written
    */
   private static int writeWordImages(byte[] content, FontRenderContext frc, int counter)
         throws java.io.IOException {
      PRTokeniser tokenizer = new PRTokeniser(content);

      while (tokenizer.nextToken()) {
         if (tokenizer.getTokenType() != PRTokeniser.TK_STRING) {
            continue;
         }

         String text = tokenizer.getStringValue();
         System.out.println(text);

         // Strings are immutable, so the trimmed value has to be kept.
         String trimmed = text.trim();
         if (trimmed.length() == 0) {
            continue;
         }

         // Splitting on runs of whitespace never produces an empty word,
         // which TextLayout would reject with an IllegalArgumentException.
         String[] words = trimmed.split("\\s+");
         for (int n = 0; n < words.length; n++) {
            if (words[n].length() == 0) {
               continue;
            }

            BufferedImage image = renderWord(words[n], frc);
            File outputFile = new File(++counter + " image.jpg");
            boolean written = ImageIO.write(image, "JPG", outputFile);
            image.flush();
            if (!written) {
               throw new java.io.IOException(
                     "No JPG image writer available for " + outputFile);
            }
         }
      }
      return counter;
   }

   /**
    * Obtains the font render context of a default RGB image so that text
    * measurements match what is later drawn into images of the same type.
    *
    * @return the render context of a scratch {@code BufferedImage}
    */
   private static FontRenderContext createFontRenderContext() {
      BufferedImage scratch = new BufferedImage(1, 1, BufferedImage.TYPE_INT_RGB);
      Graphics2D g = scratch.createGraphics();
      try {
         return g.getFontRenderContext();
      }
      finally {
         g.dispose();
         scratch.flush();
      }
   }

   /**
    * Draws a single word in black on a white image sized to the word.
    *
    * @param word non-empty word to draw
    * @param frc  render context used to lay out the word
    * @return a new image containing the rendered word
    */
   private static BufferedImage renderWord(String word, FontRenderContext frc) {
      TextLayout layout = new TextLayout(word, WORD_FONT, frc);
      Rectangle2D bounds = layout.getBounds();

      int width = (int) bounds.getWidth() + EXTRA_WIDTH;
      int height = (int) bounds.getHeight() + EXTRA_HEIGHT;

      BufferedImage image = new BufferedImage(width, height, BufferedImage.TYPE_INT_RGB);
      Graphics2D g = image.createGraphics();
      try {
         //painting the background
         g.setColor(Color.WHITE);
         g.fillRect(0, 0, width, height);

         //drawing the word
         g.setColor(Color.BLACK);
         // The bounds are relative to the layout origin (left end of the
         // baseline), so shifting by their negated corner puts the text box
         // at (MARGIN, MARGIN) for any font size, descenders included.
         layout.draw(g,
               MARGIN - (float) bounds.getX(),
               MARGIN - (float) bounds.getY());
      }
      finally {
         g.dispose();
      }
      return image;
   }
}// end class Deliverable_2