How do I split large excel file into multiple smaller files?

A friend of mine told me that he has a large Excel file, and he asked me if I could split the file into multiple smaller Excel files. So I write this little program using Apache POI to do it.

The code snippet below basically contains the following steps:

  1. Load the Excel as an InputStream from the classpath, if the file was not found the program will exit.
  2. Create XSSFWorkbook from the input stream, and get the first XSSFSheet from the workbook.
  3. Iterate the rows of data from the source worksheet.
  4. On the first rownum for each split file we create a new workbook using SXSSFWorkbook and also create the SXSSFSheet.
  5. Read the first row from the source worksheet, store it in headerRow to be used for creating header row in each new sheet.
  6. If we are at the first row, write the header.
  7. Write each row from the source sheet to the destination sheet untuk the max rows is reached.
  8. Write the workbook into a new file.

And here is the complete code that you can try.

package org.kodejava.poi;

import org.apache.poi.ss.usermodel.Cell;
import org.apache.poi.ss.usermodel.CellType;
import org.apache.poi.ss.usermodel.Row;
import org.apache.poi.xssf.streaming.SXSSFCell;
import org.apache.poi.xssf.streaming.SXSSFRow;
import org.apache.poi.xssf.streaming.SXSSFSheet;
import org.apache.poi.xssf.streaming.SXSSFWorkbook;
import org.apache.poi.xssf.usermodel.XSSFRow;
import org.apache.poi.xssf.usermodel.XSSFSheet;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;

import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.time.LocalDateTime;
import java.time.temporal.ChronoUnit;

public class SplitExcelDemo {
    public static final int MAX_ROWS_PER_FILE = 1000 - 1;
    public static final boolean WITH_HEADER = true;

    public static void main(String[] args) {
        LocalDateTime startTime = LocalDateTime.now();
        String filename = "/stock.xlsx";
        try (InputStream is = SplitExcelDemo.class.getResourceAsStream(filename)) {
            if (is == null) {
                System.out.println("Source file was not found!");
                System.exit(1);
            }

            XSSFWorkbook srcWorkbook = new XSSFWorkbook(is);
            XSSFSheet srcSheet = srcWorkbook.getSheetAt(0);
            int physicalNumberOfRows = srcSheet.getPhysicalNumberOfRows();

            int rownum = 0;
            int splitCounter = 0;

            SXSSFWorkbook destWorkbook = null;
            SXSSFSheet destSheet = null;

            XSSFRow headerRow = null;

            for (Row srcRow : srcSheet) {
                if (rownum == 0) {
                    // At the beginning let's create a new workbook and worksheet
                    destWorkbook = new SXSSFWorkbook();
                    destSheet = destWorkbook.createSheet();
                }

                if (srcRow.getRowNum() == 0 && WITH_HEADER) {
                    // Copy header row to be use in each split file
                    headerRow = (XSSFRow) srcRow;
                }

                if (rownum == 0 && WITH_HEADER) {
                    // Add row header to each split file
                    if (headerRow != null) {
                        SXSSFRow firstRow = destSheet.createRow(rownum);
                        int index = 0;
                        for (Cell cell : headerRow) {
                            SXSSFCell headerCell = firstRow.createCell(index++, cell.getCellType());
                            if (cell.getCellType() == CellType.STRING) {
                                headerCell.setCellValue(cell.getStringCellValue());
                            }
                        }
                    }
                } else {
                    // Copy rows from source worksheet into destination worksheet
                    SXSSFRow descRow = destSheet.createRow(rownum);
                    int index = 0;
                    for (Cell cell : srcRow) {
                        SXSSFCell destCell = descRow.createCell(index++, cell.getCellType());
                        switch (cell.getCellType()) {
                            case NUMERIC -> destCell.setCellValue(cell.getNumericCellValue());
                            case STRING -> destCell.setCellValue(cell.getStringCellValue());
                        }
                    }
                }

                // When a max number of rows copied are reached, or when we are at the end of worksheet, 
                // write data into a new file 
                if (rownum == MAX_ROWS_PER_FILE || srcRow.getRowNum() == physicalNumberOfRows - 1) {
                    rownum = -1;
                    String output = String.format("split-%03d.xlsx", splitCounter++);
                    System.out.println("Writing " + output);
                    try (OutputStream os = new FileOutputStream(output)) {
                        destWorkbook.write(os);
                    } catch (IOException e){
                        e.printStackTrace();
                    }
                }

                rownum = rownum + 1;
            }

            // Display processing time
            LocalDateTime endTime = LocalDateTime.now();
            long minutes = startTime.until(endTime, ChronoUnit.MINUTES);
            startTime = startTime.plusMinutes(minutes);
            long seconds = startTime.until(endTime, ChronoUnit.SECONDS);
            System.out.printf("Splitting finished in %d minutes and %d seconds %n", minutes, seconds);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}

Maven Dependencies

<dependencies>
    <dependency>
        <groupId>org.apache.poi</groupId>
        <artifactId>poi</artifactId>
        <version>5.2.3</version>
    </dependency>
    <dependency>
        <groupId>org.apache.poi</groupId>
        <artifactId>poi-ooxml</artifactId>
        <version>5.2.3</version>
    </dependency>
</dependencies>

Maven Central Maven Central

The output of running this program will look like this:

Writing split-000.xlsx
Writing split-001.xlsx
Writing split-002.xlsx
Writing split-003.xlsx
Writing split-004.xlsx
Writing split-005.xlsx
Writing split-006.xlsx
Writing split-007.xlsx
Writing split-008.xlsx
Writing split-009.xlsx
Splitting finished in 0 minutes and 8 seconds 

Why do I get NotOLE2FileException: Invalid header signature error?

When I was creating the code snippet for the How do I replace text in Microsoft Word document using Apache POI? I got the following error when running the snippet . As additional information, the code snippet for the Kodejava website is written as a Maven project.

org.apache.poi.poifs.filesystem.NotOLE2FileException: Invalid header signature; read 0xE011BDBFEFBDBFEF, expected 0xE11AB1A1E011CFD0 – Your file appears not to be a valid OLE2 document

This error message was produce when the code trying to open a Word document. The Apache POI reports that the Word document has an invalid header signature, not a valid OLE2 document. Comparing the original document with the document under maven’s target directory I found out that the file size was different. This could mean that something alter the document and corrupt the header.

After doing some googling, I found out that this error is due to maven resource filtering. The maven resource filtering process cause the Word document header corrupt during the copy phase. The solution to this problem is to make sure that the filtering process is set to false. The pom.xml of the maven project should be altered to have the following configuration.

<build>
    <resources>
        <resource>
            <directory>src/main/resources</directory>
            <filtering>false</filtering>
        </resource>
    </resources>
</build>

How do I replace text in Microsoft Word document using Apache POI?

The code snippet below show you how you can replace string in Microsoft Word document using the Apache POI library. The class below have three method, the openDocument(), saveDocument() and replaceText().

The routine for replacing text is implemented in the replaceText() method. This method take the HWPFDocument, the String to find and the String to replace it as parameters. The openDocument() opens the Word document. When the text replacement is done the Word document will be saved by the saveDocument() method.

And here is the complete code snippet. It will replace every dolor texts into d0l0r texts in the source document, the lipsum.doc, and save the result in a new document called new-lipsum.doc.

package org.kodejava.poi;

import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.usermodel.CharacterRun;
import org.apache.poi.hwpf.usermodel.Paragraph;
import org.apache.poi.hwpf.usermodel.Range;
import org.apache.poi.hwpf.usermodel.Section;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.net.URL;

public class WordReplaceText {
    private static final String SOURCE_FILE = "lipsum.doc";
    private static final String OUTPUT_FILE = "new-lipsum.doc";

    public static void main(String[] args) throws Exception {
        WordReplaceText instance = new WordReplaceText();
        try (HWPFDocument doc = instance.openDocument(SOURCE_FILE)) {
            if (doc != null) {
                HWPFDocument newDoc = instance.replaceText(doc, "dolor", "d0l0r");
                instance.saveDocument(newDoc, OUTPUT_FILE);
            }
        }
    }

    private HWPFDocument replaceText(HWPFDocument doc, String findText, String replaceText) {
        Range range = doc.getRange();
        for (int numSec = 0; numSec < range.numSections(); ++numSec) {
            Section sec = range.getSection(numSec);
            for (int numPara = 0; numPara < sec.numParagraphs(); numPara++) {
                Paragraph para = sec.getParagraph(numPara);
                for (int numCharRun = 0; numCharRun < para.numCharacterRuns(); numCharRun++) {
                    CharacterRun charRun = para.getCharacterRun(numCharRun);
                    String text = charRun.text();
                    if (text.contains(findText)) {
                        charRun.replaceText(findText, replaceText);
                    }
                }
            }
        }
        return doc;
    }

    private HWPFDocument openDocument(String file) throws Exception {
        URL res = getClass().getClassLoader().getResource(file);
        HWPFDocument document = null;
        if (res != null) {
            document = new HWPFDocument(new POIFSFileSystem(
                    new File(res.getPath())));
        }
        return document;
    }

    private void saveDocument(HWPFDocument doc, String file) {
        try (FileOutputStream out = new FileOutputStream(file)) {
            doc.write(out);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}

Maven Dependencies

<dependencies>
    <dependency>
        <groupId>org.apache.poi</groupId>
        <artifactId>poi</artifactId>
        <version>5.2.3</version>
    </dependency>
    <dependency>
        <groupId>org.apache.poi</groupId>
        <artifactId>poi-scratchpad</artifactId>
        <version>5.2.3</version>
    </dependency>
</dependencies>

Maven Central Maven Central

How do I check if a cell contains a date value?

The code below check if the first cell of an Excel file contains a date value. In Excel the cell type for date is returned as HSSFCell.CELL_TYPE_NUMERIC, to make sure if it contains a date we can use a utility method DateUtil.isCellDateFormatted(Cell cell), this method will check if the cell value is a valid date.

package org.kodejava.poi;

import org.apache.poi.hssf.usermodel.HSSFCell;
import org.apache.poi.hssf.usermodel.HSSFSheet;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.ss.usermodel.CellType;
import org.apache.poi.ss.usermodel.DateUtil;

import java.io.FileInputStream;
import java.io.FileNotFoundException;

public class DateCellType {
    public static void main(String[] args) throws Exception {
        String filename = "datecelltype.xls";

        try (FileInputStream fis = new FileInputStream(filename)) {
            HSSFWorkbook workbook = new HSSFWorkbook(fis);
            HSSFSheet sheet = workbook.getSheetAt(0);

            // Read a cell the first cell on the sheet.
            HSSFCell cell = sheet.getRow(0).getCell(0);
            if (cell.getCellType() == CellType.NUMERIC) {
                System.out.println("Cell type for date data type is numeric.");
            }

            // Using HSSFDateUtil to check if a cell contains a date.
            if (DateUtil.isCellDateFormatted(cell)) {
                System.out.println("The cell contains a date value: "
                        + cell.getDateCellValue());
            }
        } catch (FileNotFoundException e) {
            e.printStackTrace();
        }
    }
}

The example result of the code snippet:

Cell type for date data type is numeric.
The cell contains a date value: Fri Oct 08 00:00:00 CST 2021

Maven Dependencies

<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi</artifactId>
    <version>5.2.3</version>
</dependency>

Maven Central

How do I obtain Excel’s cell data type?

In this example we try to obtain the Excel’s cell data type so that we can read the value using the right method. The data to be read is in a file named celltype.xls. The matrix below depict how the file is.

    |   COL
ROW |   0       1   2   3   4
----|-------------------------
0   |   1       2   A   B   TRUE
1   |   FALSE   X   Y   Z   10
package org.kodejava.poi;

import org.apache.poi.hssf.usermodel.HSSFCell;
import org.apache.poi.hssf.usermodel.HSSFRow;
import org.apache.poi.hssf.usermodel.HSSFSheet;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.ss.usermodel.Cell;
import org.apache.poi.ss.usermodel.CellType;
import org.apache.poi.ss.usermodel.Row;

import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.util.Iterator;

public class ObtainingCellType {

    public static void main(String[] args) throws Exception {
        String filename = "celltype.xls";

        try (FileInputStream fis = new FileInputStream(filename)) {
            HSSFWorkbook workbook = new HSSFWorkbook(fis);
            HSSFSheet sheet = workbook.getSheetAt(0);

            Iterator<Row> rows = sheet.rowIterator();
            while (rows.hasNext()) {
                HSSFRow row = (HSSFRow) rows.next();

                Iterator<Cell> cells = row.cellIterator();
                while (cells.hasNext()) {
                    HSSFCell cell = (HSSFCell) cells.next();

                    CellType type = cell.getCellType();
                    if (type == CellType.STRING) {
                        System.out.printf("[%d, %d] = STRING; Value = %s%n",
                                cell.getRowIndex(), cell.getColumnIndex(),
                                cell.getRichStringCellValue().toString());
                    } else if (type == CellType.NUMERIC) {
                        System.out.printf("[%d, %d] = NUMERIC; Value = %f%n",
                                cell.getRowIndex(), cell.getColumnIndex(),
                                cell.getNumericCellValue());
                    } else if (type == CellType.BOOLEAN) {
                        System.out.printf("[%d, %d] = BOOLEAN; Value = %b%n",
                                cell.getRowIndex(), cell.getColumnIndex(),
                                cell.getBooleanCellValue());
                    } else if (type == CellType.BLANK) {
                        System.out.printf("[%d, %d] = BLANK CELL%n",
                                cell.getRowIndex(), cell.getColumnIndex());
                    }
                }
            }
        } catch (FileNotFoundException e) {
            e.printStackTrace();
        }
    }
}

Our program iterates the Excel file rows and cells and produce the following
output:

[0, 0] = NUMERIC; Value = 1.000000
[0, 1] = NUMERIC; Value = 2.000000
[0, 2] = STRING; Value = A
[0, 3] = STRING; Value = B
[0, 4] = BOOLEAN; Value = true
[1, 0] = BOOLEAN; Value = false
[1, 1] = STRING; Value = X
[1, 2] = STRING; Value = Y
[1, 3] = STRING; Value = Z
[1, 4] = NUMERIC; Value = 10.000000

Maven Dependencies

<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi</artifactId>
    <version>5.2.3</version>
</dependency>

Maven Central

How do I read Excel file?

In this example we demonstrate how to read data from an Excel file. In this example we limit to read a string data only. After reading the data, iterating each row and cells of the Excel file we display the content to the program console.

package org.kodejava.poi;

import org.apache.poi.hssf.usermodel.HSSFCell;
import org.apache.poi.hssf.usermodel.HSSFRow;
import org.apache.poi.hssf.usermodel.HSSFSheet;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.ss.usermodel.Cell;
import org.apache.poi.ss.usermodel.Row;

import java.io.FileInputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

public class ExcelReadExample {

    public static void main(String[] args) {
        // Excel file name. You can create a file name with a full
        // path information.
        String filename = "data.xls";

        // Create an ArrayList to store the data read from Excel sheet.
        List<List<HSSFCell>> sheetData = new ArrayList<>();

        try (FileInputStream fis = new FileInputStream(filename)) {
            // Create an Excel workbook from the file system.
            HSSFWorkbook workbook = new HSSFWorkbook(fis);
            // Get the first sheet on the workbook.
            HSSFSheet sheet = workbook.getSheetAt(0);

            // When we have a sheet object in hand we can iterator on
            // each sheet's rows and on each row's cells. We store the
            // data read on an ArrayList so that we can print the
            // content of the Excel to the console.
            Iterator<Row> rows = sheet.rowIterator();
            while (rows.hasNext()) {
                HSSFRow row = (HSSFRow) rows.next();
                Iterator<Cell> cells = row.cellIterator();

                List<HSSFCell> data = new ArrayList<>();
                while (cells.hasNext()) {
                    HSSFCell cell = (HSSFCell) cells.next();
                    data.add(cell);
                }
                sheetData.add(data);
            }
        } catch (IOException e) {
            e.printStackTrace();
        }

        showExcelData(sheetData);
    }

    private static void showExcelData(List<List<HSSFCell>> sheetData) {
        // Iterates the data and print it out to the console.
        for (List<HSSFCell> data : sheetData) {
            for (HSSFCell cell : data) {
                System.out.println(cell);
            }
        }
    }
}

Our program above print out the following lines:

AAAAAAAAAA
BBBBBBBBBB
CCCCCCCCCC
DDDDDDDDDD
EEEEEEEEEE
FFFFFFFFFF
GGGGGGGGGG
HHHHHHHHHH
IIIIIIIIII
JJJJJJJJJJ

Maven Dependencies

<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi</artifactId>
    <version>5.2.3</version>
</dependency>

Maven Central

How do I format cell style in Excel document?

This example demonstrate how to use HSSFCellStyle and HSSFFont to format the cell style in Excel document. Using this class we can define cell border, foreground and background color. We can also define the font we used to display the cell value.

package org.kodejava.poi;

import org.apache.poi.hssf.usermodel.*;
import org.apache.poi.hssf.util.HSSFColor;
import org.apache.poi.ss.usermodel.BorderStyle;
import org.apache.poi.ss.usermodel.FillPatternType;

import java.io.FileOutputStream;
import java.io.IOException;

public class ExcelCellFormat {
    public static void main(String[] args) {
        // Create an instance of workbook and sheet
        try (HSSFWorkbook workbook = new HSSFWorkbook()) {
            HSSFSheet sheet = workbook.createSheet();

            // Create an instance of HSSFCellStyle which will be used to format the
            // cell. Here we define the cell top and bottom border, and we also
            // define the background color.
            HSSFCellStyle style = workbook.createCellStyle();
            style.setBorderTop(BorderStyle.DOUBLE);
            style.setBorderBottom(BorderStyle.THIN);
            style.setFillForegroundColor(
                    HSSFColor.HSSFColorPredefined.GREY_25_PERCENT.getIndex());
            style.setFillPattern(FillPatternType.SOLID_FOREGROUND);

            // We also define the font that we are going to use for displaying the
            // data of the cell. We set the font to ARIAL with 20pt in size and
            // make it BOLD and give blue as the color.
            HSSFFont font = workbook.createFont();
            font.setFontName(HSSFFont.FONT_ARIAL);
            font.setFontHeightInPoints((short) 20);
            font.setBold(true);
            font.setColor(HSSFColor.HSSFColorPredefined.BLUE.getIndex());
            style.setFont(font);

            // We create a simple cell, set its value and apply the cell style.
            HSSFRow row = sheet.createRow(1);
            HSSFCell cell = row.createCell(1);
            cell.setCellValue(new HSSFRichTextString("Hi there... It's me again!"));
            cell.setCellStyle(style);
            sheet.autoSizeColumn((short) 1);

            // Finally, we write out the workbook into an Excel file.
            try (FileOutputStream fos = new FileOutputStream("ExcelDemo.xls")) {
                workbook.write(fos);
            } catch (IOException e) {
                e.printStackTrace();
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}

Maven Dependencies

<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi</artifactId>
    <version>5.2.3</version>
</dependency>

Maven Central

How do I create an Excel document using Apache POI?

This example demonstrate how to create an Excel document using Apache POI library. In this example we create a simple document containing two sheets which have a value on their first cell.

package org.kodejava.poi;

import org.apache.poi.hssf.usermodel.*;

import java.io.FileOutputStream;
import java.io.IOException;

public class CreateExcelDemo {
    public static void main(String[] args) {
        // Creating an instance of HSSFWorkbook.
        try (HSSFWorkbook workbook = new HSSFWorkbook()) {

            // Create two sheets in the Excel document and name it First Sheet and
            // Second Sheet.
            HSSFSheet firstSheet = workbook.createSheet("FIRST SHEET");
            HSSFSheet secondSheet = workbook.createSheet("SECOND SHEET");

            // Manipulate the first sheet by creating an HSSFRow which represent a
            // single row in Excel sheet, the first row started from 0 index. After
            // the row is created we create a HSSFCell in this first cell of the row
            // and set the cell value with an instance of HSSFRichTextString
            // containing the words FIRST SHEET.
            HSSFRow rowA = firstSheet.createRow(0);
            HSSFCell cellA = rowA.createCell(0);
            cellA.setCellValue(new HSSFRichTextString("FIRST SHEET"));

            HSSFRow rowB = secondSheet.createRow(0);
            HSSFCell cellB = rowB.createCell(0);
            cellB.setCellValue(new HSSFRichTextString("SECOND SHEET"));

            // To write out the workbook into a file we need to create an output
            // stream where the workbook content will be written to.
            try (FileOutputStream fos = new FileOutputStream("CreateExcelDemo.xls")) {
                workbook.write(fos);
            } catch (IOException e) {
                e.printStackTrace();
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}

Maven Dependencies

<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi</artifactId>
    <version>5.2.3</version>
</dependency>

Maven Central