安娜档案馆读秀数据的元数据竟然就有 458G

https://zh.annas-archive.org/torrents

如题,压缩包解压前 26G,解压后 458G。

让 GPT 写了个脚本转到 sqlite ,看看会有多大。

脚本如下:

# -*- coding: utf-8 -*-
import sqlite3
import json

def create_table_from_jsonl(file_path, db_path):
    """Stream a JSON Lines file into an SQLite table named ``data``.

    The column set is inferred from the keys of the first record; every
    column is created as TEXT.  Nested dict/list values are serialized to
    JSON strings before insertion; all other values are stored as-is.

    :param file_path: path to the input JSONL file (UTF-8, one JSON object per line)
    :param db_path: path of the SQLite database file to create or append to
    """
    # Connect to the SQLite database; close it even if an insert fails.
    conn = sqlite3.connect(db_path)
    try:
        cursor = conn.cursor()

        # Read only the first line to infer the table schema.
        with open(file_path, 'r', encoding='utf-8') as file:
            first_record = json.loads(file.readline())
            columns = list(first_record.keys())

        # Quote identifiers: JSON keys are untrusted input and may not be
        # valid bare SQL identifiers (spaces, hyphens, keywords, ...).
        columns_definition = ", ".join(f'"{col}" TEXT' for col in columns)
        cursor.execute(f"CREATE TABLE IF NOT EXISTS data ({columns_definition});")
        conn.commit()

        # Hoist loop-invariant SQL text out of the per-row loop — this
        # runs once per line of a very large file.
        column_list = ", ".join(f'"{col}"' for col in columns)
        placeholders = ", ".join(["?"] * len(columns))
        insert_sql = f"INSERT INTO data ({column_list}) VALUES ({placeholders});"

        write_count = 0  # number of rows inserted so far

        # Insert every record (including the first, re-read here).
        with open(file_path, 'r', encoding='utf-8') as file:
            for line in file:
                record = json.loads(line)
                # Serialize nested structures; keep scalars unchanged.
                values = [
                    json.dumps(v, ensure_ascii=False) if isinstance(v, (dict, list)) else v
                    for v in (record.get(col) for col in columns)
                ]
                cursor.execute(insert_sql, values)
                write_count += 1

                # Commit periodically so a crash loses at most 1000 rows
                # and the journal does not grow without bound.
                if write_count % 1000 == 0:
                    conn.commit()

        # Final commit for the tail of the file.
        conn.commit()
    finally:
        conn.close()

# Path of the uploaded JSONL input file.
file_path = '/mnt/data/sample.jsonl'

# Path of the SQLite database file to produce.
db_path = '/mnt/data/sample.db'

# Create the table and insert the data.
create_table_from_jsonl(file_path, db_path)

print("数据已成功写入 SQLite 数据库。")

ps V 站这个 markdown 没有语法高亮的么……