Exporting a CockroachDB table to CSV with Python

Requirements:

1. The order_sku_manage table in a CockroachDB database holds several billion rows that need to be exported to CSV files so the data can be migrated to Hadoop.

2. The export must be split across multiple files, each holding at most 500 million rows.

Notes:

Required modules: os, csv, psycopg2. CockroachDB speaks the PostgreSQL wire protocol, which is why the standard PostgreSQL driver psycopg2 can connect to it directly.
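
Because of that protocol compatibility, a plain select version() round trip is enough to verify the setup before running the full export. A minimal sketch, reusing the placeholder host and credentials from the script below; if the driver is missing, pip install psycopg2-binary provides a prebuilt package:

import psycopg2

# CockroachDB answers standard PostgreSQL-protocol queries, so this works
# with the ordinary psycopg2 driver; no CockroachDB-specific client needed.
conn = psycopg2.connect(host="11.3.117.106", port=3359, database="test",
                        user="cock_ro", password="abcd1234")
with conn.cursor() as cur:
    cur.execute("select version()")
    print(cur.fetchone()[0])
conn.close()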

# coding:utf-8
import os
import csv
import psycopg2


# CockroachDB connection settings
class CockDBConfig(object):
    host = "11.3.117.106"
    port = 3359
    database = "test"
    user = "cock_ro"
    password = "abcd1234"


conn = psycopg2.connect(
    port=CockDBConfig.port,
    host=CockDBConfig.host,
    database=CockDBConfig.database,
    user=CockDBConfig.user,
    password=CockDBConfig.password
)
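# Autocommit runs each SELECT in its own implicit transaction instead of
# holding one long transaction open for the entire multi-hour export.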
conn.set_session(autocommit=True)
cursor = conn.cursor()

# Output directory: a result/ folder next to this script; create it up front
# so the open() calls below cannot fail on a missing directory.
file_path = "{0}{1}result{1}".format(os.path.dirname(os.path.abspath(__file__)), os.sep)
os.makedirs(file_path, exist_ok=True)

# Table name, primary-key column, a sentinel id that sorts below every real
# key (so the first query starts from the top of the table), and the number
# of rows fetched per query.
table_name = "order_sku_manage"
key_column = "order_sku"
min_id = "0000000000000000000"
limit_num = 10000

# Maximum rows per output file: 500 million, i.e. 50,000 fetches of
# limit_num rows before rolling over to the next file.
batch_num = 500000000
file_num = 1

sql = "select * from {0} where {1} > '{2}' order by {1} limit {3}"

sel_times = 1
while True:
    exec_sql = sql.format(table_name, key_column, min_id, limit_num)
    cursor.execute(exec_sql)
    rows = cursor.fetchall()

    # Stop once a query comes back empty: the whole table has been exported.
    if not rows:
        break
    else:
        # The batch is sorted by key_column ascending, so its last row holds
        # the largest key; use that as the lower bound of the next query.
        # cursor.description gives the column order of "select *".
        key_idx = [col[0] for col in cursor.description].index(key_column)
        min_id = rows[-1][key_idx]

        # sel_times counts the fetches written to the current file; once the
        # file would grow past batch_num rows, roll over to a new one.
        if sel_times * limit_num > batch_num:
            sel_times = 1
            file_num += 1

        # Append this batch to the current file ("a" mode creates the file
        # on first use, then appends).
        data_file = "{}{}_data_{:0>2}.csv".format(file_path, table_name, file_num)
        with open(data_file, "a", newline="", encoding="utf-8") as csv_file:
            w = csv.writer(csv_file)
            w.writerows(rows)

        sel_times += 1

cursor.close()
conn.close()
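
For comparison, an alternative to hand-rolled keyset pagination is a psycopg2 server-side (named) cursor, which streams the result set in fixed-size chunks without re-issuing a SELECT per page. This is a minimal sketch, not a drop-in replacement: the cursor name is arbitrary, the file-splitting logic is omitted, and it assumes a CockroachDB version that supports the PostgreSQL cursor protocol (older releases did not):

import csv
import psycopg2

conn = psycopg2.connect(host="11.3.117.106", port=3359, database="test",
                        user="cock_ro", password="abcd1234")

# A named cursor lives on the server; psycopg2 declares it inside the
# connection's open transaction, so autocommit is deliberately left off.
cursor = conn.cursor(name="order_sku_export")  # hypothetical cursor name
cursor.execute("select * from order_sku_manage")

with open("order_sku_manage_data.csv", "w", newline="", encoding="utf-8") as f:
    w = csv.writer(f)
    while True:
        rows = cursor.fetchmany(10000)  # one network round trip per chunk
        if not rows:
            break
        w.writerows(rows)

cursor.close()
conn.close()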
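If the CSV files can land on the cluster nodes or in cloud storage instead of the client machine, CockroachDB also offers a built-in EXPORT statement that writes CSV server-side and in parallel, avoiding pulling billions of rows through a single Python process. A sketch, hedged on version and licensing (EXPORT has been an enterprise feature in some releases) and with an illustrative destination path:

import psycopg2

conn = psycopg2.connect(host="11.3.117.106", port=3359, database="test",
                        user="cock_ro", password="abcd1234")
cursor = conn.cursor()

# EXPORT runs inside the cluster; the client only issues the statement.
# 'nodelocal://1/...' writes under node 1's extern directory; s3:// or
# gs:// URLs are accepted in the same position.
cursor.execute(
    "EXPORT INTO CSV 'nodelocal://1/order_sku_manage' FROM TABLE order_sku_manage"
)
for row in cursor.fetchall():
    print(row)  # one result row per file written (filename, rows, bytes)

cursor.close()
conn.close()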