- import org.apache.log4j.LogManager;
- import org.apache.log4j.Logger;
- import java.io.*;
- import java.util.*;
- import java.util.concurrent.*;
- public class FileSplitUtil {
- private final static Logger log = LogManager.getLogger(FileSplitUtil.class);
- private static final long originFileSize = 1024 * 1024 * 100;// 100M
- private static final int blockFileSize = 1024 * 1024 * 64;// 防止中文乱码,必须取2的N次方
- /**
- * CVS文件分隔符
- */
- private static final char cvsSeparator = '^';
- public static void main(String args[]){
- long start = System.currentTimeMillis();
- try {
- String fileName = "D:\\csvtest\\aa.csv";
- File sourceFile = new File(fileName);
- if (sourceFile.length() >= originFileSize) {
- String cvsFileName = fileName.replaceAll("\\\\", "/");
- FileSplitUtil fileSplitUtil = new FileSplitUtil();
- List<String> parts=fileSplitUtil.splitBySize(cvsFileName, blockFileSize);
- for(String part:parts){
- System.out.println("partName is:"+part);
- }
- }
- System.out.println("总文件长度"+sourceFile.length()+",拆分文件耗时:" + (System.currentTimeMillis() - start) + "ms.");
- }catch (Exception e){
- log.info(e.getStackTrace());
- }
- }
- /**
- * 拆分文件
- *
- * @param fileName 待拆分的完整文件名
- * @param byteSize 按多少字节大小拆分
- * @return 拆分后的文件名列表
- */
- public List<String> splitBySize(String fileName, int byteSize)
- throws IOException, InterruptedException {
- List<String> parts = new ArrayList<String>();
- File file = new File(fileName);
- int count = (int) Math.ceil(file.length() / (double) byteSize);
- int countLen = (count + "").length();
- RandomAccessFile raf = new RandomAccessFile(fileName, "r");
- long totalLen = raf.length();
- CountDownLatch latch = new CountDownLatch(count);
- for (int i = 0; i < count; i++) {
- String partFileName = file.getPath() + "."
- + leftPad((i + 1) + "", countLen, '0') + ".cvs";
- int readSize=byteSize;
- long startPos=(long)i * byteSize;
- long nextPos=(long)(i+1) * byteSize;
- if(nextPos>totalLen){
- readSize= (int) (totalLen-startPos);
- }
- new SplitRunnable(readSize, startPos, partFileName, file, latch).run();
- parts.add(partFileName);
- }
- latch.await();//等待所有文件写完
- //由于切割时可能会导致行被切断,加工所有的的分割文件,合并行
- mergeRow(parts);
- return parts;
- }
- /**
- * 分割处理Runnable
- *
- * @author supeidong
- */
- private class SplitRunnable implements Runnable {
- int byteSize;
- String partFileName;
- File originFile;
- long startPos;
- CountDownLatch latch;
- public SplitRunnable(int byteSize, long startPos, String partFileName,
- File originFile, CountDownLatch latch) {
- this.startPos = startPos;
- this.byteSize = byteSize;
- this.partFileName = partFileName;
- this.originFile = originFile;
- this.latch = latch;
- }
- public void run() {
- RandomAccessFile rFile;
- OutputStream os;
- try {
- rFile = new RandomAccessFile(originFile, "r");
- byte[] b = new byte[byteSize];
- rFile.seek(startPos);// 移动指针到每“段”开头
- int s = rFile.read(b);
- os = new FileOutputStream(partFileName);
- os.write(b, 0, s);
- os.flush();
- os.close();
- latch.countDown();
- } catch (IOException e) {
- log.error(e.getMessage());
- latch.countDown();
- }
- }
- }
- /**
- * 合并被切断的行
- *
- * @param parts
- */
- private void mergeRow(List<String> parts) {
- List<PartFile> partFiles = new ArrayList<PartFile>();
- try {
- //组装被切分表对象
- for (int i=0;i<parts.size();i++) {
- String partFileName=parts.get(i);
- File splitFileTemp = new File(partFileName);
- if (splitFileTemp.exists()) {
- PartFile partFile = new PartFile();
- BufferedReader reader=new BufferedReader(new InputStreamReader(new FileInputStream(splitFileTemp),"gbk"));
- String firstRow = reader.readLine();
- String secondRow = reader.readLine();
- String endRow = readLastLine(partFileName);
- partFile.setPartFileName(partFileName);
- partFile.setFirstRow(firstRow);
- partFile.setEndRow(endRow);
- if(i>=1){
- String prePartFile=parts.get(i - 1);
- String preEndRow = readLastLine(prePartFile);
- partFile.setFirstIsFull(getCharCount(firstRow+preEndRow)>getCharCount(secondRow));
- }
- partFiles.add(partFile);
- reader.close();
- }
- }
- //进行需要合并的行的写入
- for (int i = 0; i < partFiles.size() - 1; i++) {
- PartFile partFile = partFiles.get(i);
- PartFile partFileNext = partFiles.get(i + 1);
- StringBuilder sb = new StringBuilder();
- if (partFileNext.getFirstIsFull()) {
- sb.append("\r\n");
- sb.append(partFileNext.getFirstRow());
- } else {
- sb.append(partFileNext.getFirstRow());
- }
- writeLastLine(partFile.getPartFileName(),sb.toString());
- }
- } catch (Exception e) {
- log.error(e.getMessage());
- }
- }
- /**
- * 得到某个字符出现的次数
- * @param s
- * @return
- */
- private int getCharCount(String s) {
- int count = 0;
- for (int i = 0; i < s.length(); i++) {
- if (s.charAt(i) == cvsSeparator) {
- count++;
- }
- }
- return count;
- }
- /**
- * 采用BufferedInputStream方式读取文件行数
- *
- * @param filename
- * @return
- */
- public int getFileRow(String filename) throws IOException {
- InputStream is = new BufferedInputStream(new FileInputStream(filename));
- byte[] c = new byte[1024];
- int count = 0;
- int readChars = 0;
- while ((readChars = is.read(c)) != -1) {
- for (int i = 0; i < readChars; ++i) {
- if (c[i] == '\n')
- ++count;
- }
- }
- is.close();
- return count;
- }
- /**
- * 读取最后一行数据
- * @param filename
- * @return
- * @throws IOException
- */
- private String readLastLine(String filename) throws IOException {
- // 使用RandomAccessFile , 从后找最后一行数据
- RandomAccessFile raf = new RandomAccessFile(filename, "r");
- long len = raf.length();
- String lastLine = "";
- if(len!=0L) {
- long pos = len - 1;
- while (pos > 0) {
- pos--;
- raf.seek(pos);
- if (raf.readByte() == '\n') {
- lastLine = raf.readLine();
- lastLine=new String(lastLine.getBytes("8859_1"), "gbk");
- break;
- }
- }
- }
- raf.close();
- return lastLine;
- }
- /**
- * 修改最后一行数据
- * @param fileName
- * @param lastString
- * @return
- * @throws IOException
- */
- private void writeLastLine(String fileName,String lastString){
- try {
- // 打开一个随机访问文件流,按读写方式
- RandomAccessFile randomFile = new RandomAccessFile(fileName, "rw");
- // 文件长度,字节数
- long fileLength = randomFile.length();
- //将写文件指针移到文件尾。
- randomFile.seek(fileLength);
- //此处必须加gbk,否则会出现写入乱码
- randomFile.write(lastString.getBytes("gbk"));
- randomFile.close();
- } catch (IOException e) {
- log.error(e.getMessage());
- }
- }
- /**
- * 左填充
- *
- * @param str
- * @param length
- * @param ch
- * @return
- */
- public static String leftPad(String str, int length, char ch) {
- if (str.length() >= length) {
- return str;
- }
- char[] chs = new char[length];
- Arrays.fill(chs, ch);
- char[] src1= str.toCharArray();
- System.arraycopy(src, 0, chs, length - src.length, src.length);
- return new String(chs);
- }
- /**
- * 合并文件行内部类
- */
- class PartFile {
- private String partFileName;
- private String firstRow;
- private String endRow;
- private boolean firstIsFull;
- public String getPartFileName() {
- return partFileName;
- }
- public void setPartFileName(String partFileName) {
- this.partFileName = partFileName;
- }
- public String getFirstRow() {
- return firstRow;
- }
- public void setFirstRow(String firstRow) {
- this.firstRow = firstRow;
- }
- public String getEndRow() {
- return endRow;
- }
- public void setEndRow(String endRow) {
- this.endRow = endRow;
- }
- public boolean getFirstIsFull() {
- return firstIsFull;
- }
- public void setFirstIsFull(boolean firstIsFull) {
- this.firstIsFull = firstIsFull;
- }
- }
- }
来源: http://www.phpxs.com/code/1009802/