Picking
out text and images and saving them manually can be a long and frustrating
process, especially in a large file with lots of pages. Fortunately, there’s a
method that makes the process quite simple. Follow the tutorial below in order
to extract text and images from a Word document in an easy way.
Before getting started, please download the Free Spire.Doc for
Java package through this link, unzip the package, and then import
Spire.Doc.jar from the "lib" folder into your application.
Extract Text
import com.spire.doc.Document;
import java.io.FileWriter;
import java.io.IOException;
public class ExtractText {
/**
 * Entry point: reads the sample Word document, pulls out its text and
 * hands that text to {@code writeStringToTxt} for saving.
 *
 * @param args command-line arguments (not used)
 * @throws IOException if saving the extracted text fails
 */
public static void main(String[] args) throws IOException {
    // open the source Word file
    Document wordFile = new Document();
    wordFile.loadFromFile("C:\\Users\\Test1\\Desktop\\Sample.docx");

    // grab the document text and store it straight into the .txt file
    writeStringToTxt(wordFile.getText(), "output/ExtractedText.txt");
}
public static void writeStringToTxt(String content, String txtFileName) throws IOException{
FileWriter fWriter= new FileWriter(txtFileName,true);
try {fWriter.write(content);
Output
}catch(IOException ex){
ex.printStackTrace();
}finally{
try{
fWriter.flush();
fWriter.close();
} catch (IOException ex) {
ex.printStackTrace();
}
}
}
}
Extract Images
import com.spire.doc.Document;
import com.spire.doc.documents.DocumentObjectType;
import com.spire.doc.fields.DocPicture;
import com.spire.doc.interfaces.ICompositeObject; import com.spire.doc.interfaces.IDocumentObject;
import javax.imageio.ImageIO;
import java.awt.image.RenderedImage;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;
import java.util.Queue;
/**
 * Collects the pictures found in a Word document and saves each one as its
 * own PNG file.
 */
public class ExtractImage {

    /**
     * Loads the sample document, visits its object tree breadth-first to
     * gather pictures, then writes them to
     * {@code output/ExtractedImage-<index>.png}.
     *
     * @param args command-line arguments (not used)
     * @throws IOException if an image cannot be written
     */
    public static void main(String[] args) throws IOException {
        //load word document
        Document document = new Document();
        document.loadFromFile("C:\\Users\\Test1\\Desktop\\Sample.docx");

        //queue of composite objects whose children still have to be visited
        Queue<ICompositeObject> nodes = new LinkedList<>();
        nodes.add(document);

        //images gathered so far, in the order they were encountered
        List<RenderedImage> images = new ArrayList<>();

        //loop through the child objects of the document
        while (!nodes.isEmpty()) {
            ICompositeObject node = nodes.poll();
            for (int i = 0; i < node.getChildObjects().getCount(); i++) {
                IDocumentObject child = node.getChildObjects().get(i);
                if (child instanceof ICompositeObject) {
                    //composite children are queued so their own children get visited too
                    nodes.add((ICompositeObject) child);
                    //get each image and add it to the list
                    if (child.getDocumentObjectType() == DocumentObjectType.Picture) {
                        DocPicture picture = (DocPicture) child;
                        //cast once here so the save loop below needs no cast
                        images.add((RenderedImage) picture.getImage());
                    }
                }
            }
        }

        //save images as .png files
        for (int i = 0; i < images.size(); i++) {
            //%d puts the index into the name; without it every image
            //would be written to the same file and only the last would survive
            File file = new File(String.format("output/ExtractedImage-%d.png", i));
            ImageIO.write(images.get(i), "PNG", file);
        }
    }
}
No comments:
Post a Comment