Requirements:
1. The order_sku_manage table in a CockroachDB database holds several billion rows, which need to be exported to CSV files so the data can be migrated onto Hadoop.
2. The export is split across multiple files, each holding at most 500 million rows.
Notes:
Modules used: os, csv, psycopg2. CockroachDB speaks the PostgreSQL wire protocol, so the standard psycopg2 driver connects to it directly.
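Before kicking off an export this long, a quick preflight check can confirm the driver actually reaches the cluster (a minimal sketch; host and credentials mirror the configuration in the script below and should be replaced with your own):

import psycopg2

conn = psycopg2.connect(host="11.3.117.106", port=3359, database="test",
                        user="cock_ro", password="abcd1234")
cur = conn.cursor()
cur.execute("select version()")
print(cur.fetchone()[0])  # CockroachDB answers version() over the pg protocol
cur.close()
conn.close()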
# coding:utf-8
import os
import csv
import psycopg2
# CockroachDB connection parameters
class CockDBConfig(object):
    host = "11.3.117.106"
    port = 3359
    database = "test"
    user = "cock_ro"
    password = "abcd1234"
conn = psycopg2.connect(
    port=CockDBConfig.port,
    host=CockDBConfig.host,
    database=CockDBConfig.database,
    user=CockDBConfig.user,
    password=CockDBConfig.password
)
conn.set_session(autocommit=True)
cursor = conn.cursor()
# Output directory: a result folder under the directory of this .py file;
# create it up front so that open() below does not fail
file_path = "{0}{1}result{1}".format(os.path.dirname(os.path.abspath(__file__)), os.sep)
os.makedirs(file_path, exist_ok=True)
# Table name, primary-key column, a starting id that sorts before the smallest
# key in the table (an all-zeros string here, which assumes order_sku is a
# fixed-width string key), and the number of rows fetched per query
table_name = "order_sku_manage"
key_column = "order_sku"
min_id = "0000000000000000000"
limit_num = 10000
# Maximum number of rows per output file: 500 million, i.e. 50,000 queries of
# limit_num rows each
batch_num = 500000000
file_num = 1
sql = "select * from {0} where {1} > '{2}' order by {1} limit {3}"
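# Keyset pagination: each round resumes strictly after the largest key already
# seen, so no row is read twice and the query stays fast no matter how deep
# into the table the scan is (unlike OFFSET-based paging). table_name and
# key_column are fixed constants in this script, so str.format into the SQL is
# acceptable here; never build SQL this way from user-supplied input.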
sel_times = 1
while True:
    exec_sql = sql.format(table_name, key_column, min_id, limit_num)
    cursor.execute(exec_sql)
    rows = cursor.fetchall()
    # An empty result set means the whole table has been scanned
    if not rows:
        break
    else:
        # Use the largest id of this batch as the lower bound of the next
        # query; this relies on key_column being the first column returned by
        # select *
        min_id = rows[-1][0]
        # Once the current file would exceed batch_num rows, start a new one
        if sel_times * limit_num > batch_num:
            sel_times = 1
            file_num += 1
        data_file = "{}{}_data_{:0>2}.csv".format(file_path, table_name, file_num)
        # csv.writer takes care of quoting fields containing commas or newlines
        with open(data_file, "a+", newline="", encoding="utf-8") as csv_file:
            w = csv.writer(csv_file)
            w.writerows(rows)
        sel_times += 1
cursor.close()
conn.close()
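Once the export finishes, a quick sanity check (a sketch; the glob pattern simply mirrors the file-naming scheme above) can verify that every output file stays within the 500-million-row cap:

import csv
import glob
import os

result_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "result")
for path in sorted(glob.glob(os.path.join(result_dir, "order_sku_manage_data_*.csv"))):
    with open(path, newline="", encoding="utf-8") as f:
        row_count = sum(1 for _ in csv.reader(f))
    print(path, row_count)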