Model: GPT-4
User:

```python
import json
import asyncio
import aiohttp
import openai
import re

MAX_TOKENS = 4096
INPUT_PATH = "input_json/blu3mo_filtered.json"
OUTPUT_PATH = "output_json/blu3mo_filtered.json"
PROMPT = """
You are a language translator.
Target Language: English

# Task
Translate texts from the source language to English, and output the translated texts.

# Rules
- Always preserve \\n and \\s.
- Keep brackets unchanged: Brackets of [text] and [text.icon] must be kept. The content inside square brackets must never be changed.
- Preserve markup symbols like >, `, [].

# Example
Original Text:
[りんご]\\n\\s\\sバナナ\\n\\s\\s\\s[ダイアモンド.icon]
Translated Text:
[apple]\\n\\s\\sbanana\\n\\s\\s\\s[diamond.icon]
"""


async def async_translate(session, text):
    # Replace leading spaces/tabs/full-width spaces with \s
    text = re.sub(r'^([ \t　]+)', lambda m: '\\s' * len(m.group(1)), text, flags=re.MULTILINE)
    # Replace newlines with \n
    text = text.replace('\n', '\\n')
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {openai.api_key}"
    }
    data = {
        "model": "gpt-3.5-turbo",
        "messages": [
            {"role": "system", "content": PROMPT},
            {"role": "user", "content": text}
        ],
        "temperature": 0,
    }
    async with session.post("https://api.openai.com/v1/chat/completions",
                            headers=headers, json=data) as resp:
        response = await resp.json()
        print(response)
        translated_text = response["choices"][0]["message"]["content"]
    # Replace \n back to newline
    translated_text = translated_text.replace('\\n', '\n')
    # Replace \s back to spaces
    translated_text = re.sub(r'\\s', ' ', translated_text)
    return translated_text


async def translate_titles(session, title_list):
    translated_titles = []
    title_chunk = ""
    for title in title_list:
        if len(title_chunk) + len(title) + 1 < MAX_TOKENS:
            title_chunk += title + "\n"
        else:
            translated_chunk = await async_translate(session, title_chunk)
            translated_titles.extend(translated_chunk.split("\n")[:-1])
            title_chunk = title + "\n"
    if title_chunk:
        translated_chunk = await async_translate(session, title_chunk)
        translated_titles.extend(translated_chunk.split("\n")[:-1])
    return translated_titles


async def translate_page(session, page_text):
    if len(page_text) <= MAX_TOKENS:
        return await async_translate(session, page_text)
    else:
        split_point = page_text.rfind("\n", 0, MAX_TOKENS)
        first_half = page_text[:split_point]
        second_half = page_text[split_point + 1:]
        first_half_translated = await async_translate(session, first_half)
        second_half_translated = await translate_page(session, second_half)
        return first_half_translated + "\n" + second_half_translated


async def translate_json_file(input_file, output_file):
    with open(input_file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    title_translation_dict = {}

    # Translate page titles
    title_list = [page['title'] for page in data['pages']]
    async with aiohttp.ClientSession() as session:
        translated_titles = await translate_titles(session, title_list)
        for original_title, translated_title in zip(title_list, translated_titles):
            title_translation_dict[original_title] = translated_title
        for page, translated_title in zip(data['pages'], translated_titles):
            page['title'] = translated_title

        # Translate lines with translated titles replaced
        translation_tasks = []
        for page in data['pages']:
            page_text = "\n".join(page['lines'])
            for jp_title, en_title in title_translation_dict.items():
                page_text = page_text.replace(f"{jp_title}", f"{en_title}")
            translation_tasks.append(translate_page(session, page_text))
        translated_texts = await asyncio.gather(*translation_tasks)
        for page, translated_text in zip(data['pages'], translated_texts):
            page['lines'] = translated_text.split("\n")

    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=2)


# Replace 'input.json' and 'output.json' with your desired file paths
async def main():
    await translate_json_file(INPUT_PATH, OUTPUT_PATH)

asyncio.run(main())
```

I want the code to use tiktoken to calculate token length when checking whether it's above MAX_TOKENS. Fix the code.

To use tiktoken to calculate token length, you first need to install it via pip:

```bash
pip install tiktoken
```
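As a quick sanity check after installing, you can confirm that the encoding for gpt-3.5-turbo loads and tokenizes text (the exact token ids will depend on your tiktoken version):

```python
import tiktoken

enc = tiktoken.encoding_for_model("gpt-3.5-turbo")
print(enc.encode("hello world"))  # a short list of integer token ids
```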

Then, import tiktoken and implement a small helper that counts tokens using the same encoding the model uses (tiktoken exposes this via tiktoken.encoding_for_model):

```python
import tiktoken

# ...

# Load the BPE encoding once at module level; every count_tokens call reuses it.
ENCODING = tiktoken.encoding_for_model("gpt-3.5-turbo")


def count_tokens(text):
    """Count tokens the same way the API will."""
    return len(ENCODING.encode(text))

# ...
```
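A quick comparison shows why counting characters (the original len() checks) was a poor proxy for the model's token limit, especially for Japanese text:

```python
# Character count and token count can diverge sharply; Japanese text
# often costs more tokens per character than English does.
print(len("hello world"), count_tokens("hello world"))
print(len("こんにちは世界"), count_tokens("こんにちは世界"))
```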

Finally, replace the character-length checks in the translate_titles function with the new count_tokens function:

```python
async def translate_titles(session, title_list):
    translated_titles = []
    title_chunk = ""
    for title in title_list:
        if count_tokens(title_chunk) + count_tokens(title) + 1 < MAX_TOKENS:
            title_chunk += title + "\n"
        else:
            translated_chunk = await async_translate(session, title_chunk)
            translated_titles.extend(translated_chunk.split("\n")[:-1])
            title_chunk = title + "\n"
    if title_chunk:
        translated_chunk = await async_translate(session, title_chunk)
        translated_titles.extend(translated_chunk.split("\n")[:-1])
    return translated_titles
```
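One caveat worth noting: summing count_tokens(title_chunk) + count_tokens(title) + 1 only approximates the token count of the joined string, because BPE can merge characters across the concatenation boundary. A stricter check (a sketch, assuming the same count_tokens helper) would tokenize the chunk exactly as it will be sent:

```python
# Count the chunk as it will actually be sent, rather than summing its parts.
candidate = title_chunk + title + "\n"
if count_tokens(candidate) < MAX_TOKENS:
    title_chunk = candidate
```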

Here's the complete modified code with tiktoken integration:

```python
import json
import asyncio
import aiohttp
import openai
import re
import tiktoken

MAX_TOKENS = 4096
INPUT_PATH = "input_json/blu3mo_filtered.json"
OUTPUT_PATH = "output_json/blu3mo_filtered.json"
PROMPT = """
You are a language translator.
Target Language: English

# Task
Translate texts from the source language to English, and output the translated texts.

# Rules
- Always preserve \\n and \\s.
- Keep brackets unchanged: Brackets of [text] and [text.icon] must be kept. The content inside square brackets must never be changed.
- Preserve markup symbols like >, `, [].

# Example
Original Text:
[りんご]\\n\\s\\sバナナ\\n\\s\\s\\s[ダイアモンド.icon]
Translated Text:
[apple]\\n\\s\\sbanana\\n\\s\\s\\s[diamond.icon]
"""

# Load the BPE encoding once; every count_tokens call reuses it.
ENCODING = tiktoken.encoding_for_model("gpt-3.5-turbo")


def count_tokens(text):
    return len(ENCODING.encode(text))


async def async_translate(session, text):
    # Replace leading spaces/tabs/full-width spaces with \s
    text = re.sub(r'^([ \t　]+)', lambda m: '\\s' * len(m.group(1)), text, flags=re.MULTILINE)
    # Replace newlines with \n
    text = text.replace('\n', '\\n')
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {openai.api_key}"
    }
    data = {
        "model": "gpt-3.5-turbo",
        "messages": [
            {"role": "system", "content": PROMPT},
            {"role": "user", "content": text}
        ],
        "temperature": 0,
    }
    async with session.post("https://api.openai.com/v1/chat/completions",
                            headers=headers, json=data) as resp:
        response = await resp.json()
        print(response)
        translated_text = response["choices"][0]["message"]["content"]
    # Replace \
```
User:

could you continue?

Certainly! Here's the complete modified code with tiktoken integration, starting again from the top:

```python
import json
import asyncio
import aiohttp
import openai
import re
import tiktoken

MAX_TOKENS = 4096
INPUT_PATH = "input_json/blu3mo_filtered.json"
OUTPUT_PATH = "output_json/blu3mo_filtered.json"
PROMPT = """
You are a language translator.
Target Language: English

# Task
Translate texts from the source language to English, and output the translated texts.

# Rules
- Always preserve \\n and \\s.
- Keep brackets unchanged: Brackets of [text] and [text.icon] must be kept. The content inside square brackets must never be changed.
- Preserve markup symbols like >, `, [].

# Example
Original Text:
[りんご]\\n\\s\\sバナナ\\n\\s\\s\\s[ダイアモンド.icon]
Translated Text:
[apple]\\n\\s\\sbanana\\n\\s\\s\\s[diamond.icon]
"""

# Load the BPE encoding once; every count_tokens call reuses it.
ENCODING = tiktoken.encoding_for_model("gpt-3.5-turbo")


def count_tokens(text):
    return len(ENCODING.encode(text))


async def async_translate(session, text):
    # Replace leading spaces/tabs/full-width spaces with \s
    text = re.sub(r'^([ \t　]+)', lambda m: '\\s' * len(m.group(1)), text, flags=re.MULTILINE)
    # Replace newlines with \n
    text = text.replace('\n', '\\n')
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {openai.api_key}"
    }
    data = {
        "model": "gpt-3.5-turbo",
        "messages": [
            {"role": "system", "content": PROMPT},
            {"role": "user", "content": text}
        ],
        "temperature": 0,
    }
    async with session.post("https://api.openai.com/v1/chat/completions",
                            headers=headers, json=data) as resp:
        response = await resp.json()
        print(response)
        translated_text = response["choices"][0]["message"]["content"]
    # Replace \n back to newline
    translated_text = translated_text.replace('\\n', '\n')
    # Replace \s back to spaces
    translated_text = re.sub(r'\\s', ' ', translated_text)
    return translated_text


async def translate_titles(session, title_list):
    translated_titles = []
    title_chunk = ""
    for title in title_list:
        if count_tokens(title_chunk) + count_tokens(title) + 1 < MAX_TOKENS:
            title_chunk += title + "\n"
        else:
            translated_chunk = await async_translate(session, title_chunk)
            translated_titles.extend(translated_chunk.split("\n")[:-1])
            title_chunk = title + "\n"
    if title_chunk:
        translated_chunk = await async_translate(session, title_chunk)
        translated_titles.extend(translated_chunk.split("\n")[:-1])
    return translated_titles


async def translate_page(session, page_text):
    if count_tokens(page_text) <= MAX_TOKENS:
        return await async_translate(session, page_text)
    else:
        split_point = page_text.rfind("\n", 0, MAX_TOKENS)
        first_half = page_text[:split_point]
        second_half = page_text[split_point + 1:]
        first_half_translated = await async_translate(session, first_half)
        second_half_translated = await translate_page(session, second_half)
```
User:

could you continue?

Certainly! Here's the rest of the code, continuing from where the previous snippet left off:

```python
        return first_half_translated + "\n" + second_half_translated


async def translate_json_file(input_file, output_file):
    with open(input_file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    title_translation_dict = {}

    # Translate page titles
    title_list = [page['title'] for page in data['pages']]
    async with aiohttp.ClientSession() as session:
        translated_titles = await translate_titles(session, title_list)
        for original_title, translated_title in zip(title_list, translated_titles):
            title_translation_dict[original_title] = translated_title
        for page, translated_title in zip(data['pages'], translated_titles):
            page['title'] = translated_title

        # Translate lines with translated titles replaced
        translation_tasks = []
        for page in data['pages']:
            page_text = "\n".join(page['lines'])
            for jp_title, en_title in title_translation_dict.items():
                page_text = page_text.replace(f"{jp_title}", f"{en_title}")
            translation_tasks.append(translate_page(session, page_text))
        translated_texts = await asyncio.gather(*translation_tasks)
        for page, translated_text in zip(data['pages'], translated_texts):
            page['lines'] = translated_text.split("\n")

    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=2)


async def main():
    await translate_json_file(INPUT_PATH, OUTPUT_PATH)

asyncio.run(main())
```

This code snippet continues from where the previous snippet left off, finishing the implementation of the translate_page and translate_json_file functions. The main function is then called to execute the translation process using the specified input and output file paths.
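One practical note: translate_json_file launches one translate_page task per page through asyncio.gather, so a large export can run into OpenAI's rate limits. A common mitigation, not part of the original code, is to cap the number of in-flight requests with a semaphore, for example:

```python
# Hypothetical throttle (not in the original code): cap concurrent requests.
SEMAPHORE = asyncio.Semaphore(5)  # tune the limit to your rate limit


async def throttled_translate_page(session, page_text):
    async with SEMAPHORE:
        return await translate_page(session, page_text)
```

When building translation_tasks, you would then append throttled_translate_page(session, page_text) instead of calling translate_page directly.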

User:

I believe `split_point = page_text.rfind("\n", 0, MAX_TOKENS)` is a problem, because we have to split the text at the point where the number of tokens reaches MAX_TOKENS, not at a character offset.

You're right. To split the text at the position where the token count reaches MAX_TOKENS, you can modify the translate_page function as follows:

```python
async def translate_page(session, page_text):
    if count_tokens(page_text) <= MAX_TOKENS:
        return await async_translate(session, page_text)
    else:
        # Encode the full text, then decode only the first MAX_TOKENS tokens
        # to find how many characters fit within the token budget.
        tokens = ENCODING.encode(page_text)
        prefix = ENCODING.decode(tokens[:MAX_TOKENS])
        # Split at the last newline inside the prefix so lines stay intact.
        split_point = prefix.rfind("\n")
        if split_point == -1:
            split_point = len(prefix)
        first_half = page_text[:split_point].rstrip()
        second_half = page_text[split_point:].lstrip()
        first_half_translated = await async_translate(session, first_half)
        second_half_translated = await translate_page(session, second_half)
        return first_half_translated + "\n" + second_half_translated
```

In this version of the translate_page function, we encode page_text with tiktoken, decode just the first MAX_TOKENS tokens to see how much of the string fits within the budget, and split at the last newline inside that prefix. Then, we recursively translate both halves.
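One detail to be aware of: decoding a token slice can cut a multi-byte character at the very end, which tiktoken renders as a replacement character, but splitting at the last newline keeps the split point clear of that boundary. A small sanity check (with hypothetical test text) illustrates this:

```python
# Hypothetical test input: 2000 numbered lines of mixed English/Japanese.
long_text = "\n".join(f"line {i}: 日本語のテキスト" for i in range(2000))

tokens = ENCODING.encode(long_text)
prefix = ENCODING.decode(tokens[:MAX_TOKENS])
split_point = prefix.rfind("\n")

assert long_text[split_point] == "\n"  # the split lands on a real line boundary
# The first half should come in close to (and typically under) MAX_TOKENS:
print(count_tokens(long_text[:split_point]))
```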