A project of ours needed to parse a JSON file generated from a massive data set and load it into a database. We used Alibaba's fastjson and ran into a GC problem along the way; this post records it.
The data set is about 700,000 records and the file is roughly 3-4 GB. Using example 3 of fastjson's officially recommended Stream API, memory usage began climbing rapidly once about 300,000 records had been read, CPU quickly hit 100%, and GC problems appeared at around 400,000 records.
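For reference, that official streaming pattern looks roughly like the sketch below. This is a minimal reconstruction, not code from the original post; the `VO` element type and the `huge.json` file name are placeholders.

```java
import com.alibaba.fastjson.JSONReader;
import java.io.FileReader;

public class StreamApiSketch {
    // Hypothetical element type standing in for the POJO used in the official example.
    public static class VO {
        private long id;
        private String name;
        public long getId() { return id; }
        public void setId(long id) { this.id = id; }
        public String getName() { return name; }
        public void setName(String name) { this.name = name; }
    }

    public static void main(String[] args) throws Exception {
        // "huge.json" is a placeholder; the file holds one large top-level JSON array.
        JSONReader reader = new JSONReader(new FileReader("huge.json"));
        reader.startArray();                     // enter the top-level array
        while (reader.hasNext()) {
            VO vo = reader.readObject(VO.class); // deserialize one array element per call
            // handle vo ...
        }
        reader.endArray();
        reader.close();
    }
}
```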
The code used in the project was as follows:
```java
import com.alibaba.fastjson.JSONObject;
import com.alibaba.fastjson.JSONReader;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.lang3.StringUtils;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.jdbc.core.namedparam.NamedParameterJdbcTemplate;
import org.springframework.jdbc.core.namedparam.SqlParameterSourceUtils;
import org.springframework.stereotype.Component;

import java.io.*;
import java.util.*;

@Component
@Slf4j
public class EnterDatabaseUtils {

    @Autowired
    private NamedParameterJdbcTemplate namedParameterJdbcTemplate;

    private final int batchTotal = 50000;

    public boolean enterData(String databaseName, String tableName, File file, String[] fields) {
        String fileName = file.getName();
        try {
            JSONReader reader = new JSONReader(
                    new InputStreamReader(new FileInputStream(file.getAbsoluteFile()), "UTF-8"));
            String insertSql = "INSERT INTO `" + databaseName + "`.`" + tableName + "`"
                    + "(`" + StringUtils.join(fields, "`,`") + "`)"
                    + "VALUES(:" + StringUtils.join(fields, ",:") + ")";
            long count = 1;
            ArrayList<Map<String, Object>> recordList = new ArrayList<>();
            reader.startArray();
            while (reader.hasNext()) {
                // Deserialize one whole array element per call.
                JSONObject record = reader.readObject(JSONObject.class);
                if (count <= batchTotal) {
                    recordList.add(record);
                    count++;
                }
                if (batchTotal + 1 == count) {
                    // Flush a full batch of 50000 rows to the database.
                    namedParameterJdbcTemplate.batchUpdate(insertSql, SqlParameterSourceUtils.createBatch(recordList));
                    count = 1;
                    recordList.clear();
                }
            }
            if (recordList.size() > 0) {
                // Flush the remaining partial batch.
                namedParameterJdbcTemplate.batchUpdate(insertSql, SqlParameterSourceUtils.createBatch(recordList));
                recordList.clear();
            }
            reader.endArray();
            reader.close();
            return true;
        } catch (Exception e) {
            log.error(databaseName + "." + tableName + ": insert failed");
            log.error("", e);
            return false;
        }
    }
}
```
Test code:
```java
import org.junit.Test;
import org.junit.runner.RunWith;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.boot.test.context.SpringBootTest;
import org.springframework.test.context.junit4.SpringRunner;

import java.io.File;

@RunWith(SpringRunner.class)
@SpringBootTest
public class EnterDatabaseUtilsTest {

    @Autowired
    private EnterDatabaseUtils enterDatabaseUtils;

    @Test
    public void testEnterDatabase() {
        File file = new File("/xxx/xxx/xxx.json");
        String[] fields = {........};
        boolean res = enterDatabaseUtils.enterData("xxxx", "xxxx", file, fields);
    }
}
```
At first I suspected that namedParameterJdbcTemplate was behind the runaway memory usage. But after deleting all database-related operations and keeping only the JSON-reading code, memory still grew out of control and triggered GC.
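For illustration, a minimal sketch of that isolation test is below. The class and method names are hypothetical; the loop simply mirrors the original readObject-based reading code with all database work removed.

```java
import com.alibaba.fastjson.JSONObject;
import com.alibaba.fastjson.JSONReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStreamReader;

public class ReadOnlySketch {
    // Reads the file the same way enterData() does, but performs no database
    // work; even with this loop alone, memory kept growing until GC set in.
    public static void readOnly(File file) throws Exception {
        JSONReader reader = new JSONReader(
                new InputStreamReader(new FileInputStream(file), "UTF-8"));
        reader.startArray();
        while (reader.hasNext()) {
            JSONObject record = reader.readObject(JSONObject.class);
            // record is discarded immediately; no batching, no inserts
        }
        reader.endArray();
        reader.close();
    }
}
```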
So I began to suspect that fastjson was being used incorrectly. After reading a lot of articles, I finally found the answer in the post "JSON 少量数据解析" (https://www.jianshu.com/p/912ff0dcd249): calling readObject directly on each element keeps consuming memory!
After changing the code to use startObject and parse each element's keys and values one by one, memory and CPU usage stayed stable with no growth, and the problem was solved.
The improved code is as follows:
```java
import com.alibaba.fastjson.JSONObject;
import com.alibaba.fastjson.JSONReader;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.lang3.StringUtils;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.jdbc.core.namedparam.NamedParameterJdbcTemplate;
import org.springframework.jdbc.core.namedparam.SqlParameterSourceUtils;
import org.springframework.stereotype.Component;

import java.io.*;
import java.util.*;

@Component
@Slf4j
public class EnterDatabaseUtils {

    @Autowired
    private NamedParameterJdbcTemplate namedParameterJdbcTemplate;

    private final int batchTotal = 50000;

    public boolean enterData(String databaseName, String tableName, File file, String[] fields) {
        String fileName = file.getName();
        try {
            JSONReader reader = new JSONReader(
                    new InputStreamReader(new FileInputStream(file.getAbsoluteFile()), "UTF-8"));
            String insertSql = "INSERT INTO `" + databaseName + "`.`" + tableName + "`"
                    + "(`" + StringUtils.join(fields, "`,`") + "`)"
                    + "VALUES(:" + StringUtils.join(fields, ",:") + ")";
            long count = 1;
            ArrayList<Map<String, Object>> recordList = new ArrayList<>();
            reader.startArray();
            while (reader.hasNext()) {
                reader.startObject();
                // Parse each element field by field instead of materializing it
                // via readObject(JSONObject.class); use a fresh map per record.
                Map<String, Object> record = new HashMap<>();
                while (reader.hasNext()) {
                    record.put(reader.readString(), reader.readObject());
                }
                reader.endObject();
                if (count <= batchTotal) {
                    recordList.add(record);
                    count++;
                }
                if (batchTotal + 1 == count) {
                    // Flush a full batch of 50000 rows to the database.
                    namedParameterJdbcTemplate.batchUpdate(insertSql, SqlParameterSourceUtils.createBatch(recordList));
                    count = 1;
                    recordList.clear();
                }
            }
            if (recordList.size() > 0) {
                // Flush the remaining partial batch.
                namedParameterJdbcTemplate.batchUpdate(insertSql, SqlParameterSourceUtils.createBatch(recordList));
                recordList.clear();
            }
            reader.endArray();
            reader.close();
            return true;
        } catch (Exception e) {
            log.error(databaseName + "." + tableName + ": insert failed");
            log.error("", e);
            return false;
        }
    }
}
```
Source: http://www.bubuko.com/infodetail-3105897.html