Tuesday, 16 June 2020

Extract Text and Images from Word in Java

Picking out text and images and saving them manually can be a long and frustrating process, especially in a large file with lots of pages. Fortunately, there’s a method that makes the process quite simple. Follow the tutorial below in order to extract text and images from a Word document in an easy way.

Before getting started, please download the Free Spire.Doc for Java package through this link, unzip the package, and then import Spire.Doc.jar from the lib folder into your application.

Extract Text

import com.spire.doc.Document;
import java.io.FileWriter;
import java.io.IOException;
public class ExtractText {
public static void main(String[] args) throws IOException {
//load Word document
Document document = new Document();
document.loadFromFile(
"C:\\Users\\Test1\\Desktop\\Sample.docx");
//get text from document as string
String text=document.getText();
//write string to a .txt file
writeStringToTxt(text,"output/ExtractedText.txt");
    }
public static void writeStringToTxt(String content, String txtFileName) throws IOException{
FileWriter fWriter=
new FileWriter(txtFileName,true);
try {

fWriter.write(content);
    }
catch(IOException ex){
       ex.printStackTrace();
        }
finally{
           
try{
                fWriter.flush();
                fWriter.close();
            }
catch (IOException ex) {
                ex.printStackTrace();
            }
        }
    }
}

Output

Extract Images

import com.spire.doc.Document;
import com.spire.doc.documents.DocumentObjectType;
import com.spire.doc.fields.DocPicture;
import com.spire.doc.interfaces.ICompositeObject; import com.spire.doc.interfaces.IDocumentObject;
import javax.imageio.ImageIO;
import java.awt.image.RenderedImage;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;
import java.util.Queue;
public class ExtractImage {
   
public static void main(String[] args)throws IOException {
       
//load word document
       
Document document = new Document();
        document.loadFromFile(
"C:\\Users\\Test1\\Desktop\\Sample.docx");
      
//create a Queue object
       
Queue nodes = new LinkedList();
        nodes.add(document);
       
//create a List object
       
List images = new ArrayList();
        
//loop through the child objects of the document
       
while (nodes.size() > 0) {
            ICompositeObject node = (ICompositeObject) nodes.poll();
           
for (int i = 0; i < node.getChildObjects().getCount(); i++) {
                IDocumentObject child = node.getChildObjects().get(i);
               
if (child instanceof ICompositeObject) {
                    nodes.add((ICompositeObject) child);
                   
//get each image and add it to the list
                    
if (child.getDocumentObjectType() == DocumentObjectType.Picture) {
                        DocPicture picture = (DocPicture) child;
                        images.add(picture.getImage());
                    }
                }
            }
        }
       
//save images as .png files
       
for (int i = 0; i < images.size(); i++) {
            File file =
new File(String.format("output/ExtractedImage.png", i));
            ImageIO.write((RenderedImage) images.get(i),
"PNG", file);
        }
    }
}
Output


No comments:

Post a Comment

Change PDF Versions in Java

In daily work, you might need to change the version of a PDF document you have in order to ensure compatibility with another version which a...