package com.alibaba.dashscope.tokenizers;

import com.alibaba.dashscope.exception.NoSpecialTokenExists;
import com.alibaba.dashscope.exception.UnSupportedSpecialTokenMode;
import com.alibaba.dashscope.utils.StringUtils;
import java.io.BufferedReader;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Base64;
import java.util.Collections;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/* loaded from: input_file:com/alibaba/dashscope/tokenizers/QwenTokenizer.class */
public class QwenTokenizer implements Tokenizer {
    private static final String SPECIAL_START = "<|";
    private static final String SPECIAL_END = "|>";
    private static final String ENDOFTEXT = "<|endoftext|>";
    private static final String IMSTART = "<|im_start|>";
    private static final String IMEND = "<|im_end|>";
    private static final String PATTEN_STRING = "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+";
    private static final int SPECIAL_START_ID = 151643;
    private static final String TOKEN_RANK_SEPARATOR = " ";
    private static final String vocabularyBpeFile = "qwen.tiktoken";
    private static final Map<EncodeBytesEntity, Integer> mergeableRanks;
    private static final Map<String, Integer> specialTokens;
    private static final byte[][] decodeMap;
    static final /* synthetic */ boolean $assertionsDisabled;

    private EncodeBytesEntity mergePair(EncodeBytesEntity encodeBytesEntity, EncodeBytesEntity encodeBytesEntity2) {
        byte[] copyOf = Arrays.copyOf(encodeBytesEntity.bytes, encodeBytesEntity.bytes.length + encodeBytesEntity2.bytes.length);
        System.arraycopy(encodeBytesEntity2.bytes, 0, copyOf, encodeBytesEntity.bytes.length, encodeBytesEntity2.bytes.length);
        return new EncodeBytesEntity(copyOf);
    }

    private EncodeBytesEntity getLowestIndexBytePair(EncodeBytesEntity[] encodeBytesEntityArr) {
        ArrayList arrayList = new ArrayList();
        Integer num = Integer.MAX_VALUE;
        EncodeBytesEntity encodeBytesEntity = null;
        for (int i = 0; i < encodeBytesEntityArr.length - 1; i++) {
            EncodeBytesEntity mergePair = mergePair(encodeBytesEntityArr[i], encodeBytesEntityArr[i + 1]);
            if (arrayList.indexOf(mergePair) == -1) {
                Integer num2 = mergeableRanks.get(mergePair);
                if (num2 == null) {
                    mergePair.rank = Integer.MAX_VALUE;
                } else {
                    mergePair.rank = num2.intValue();
                    if (num2.intValue() < num.intValue()) {
                        num = num2;
                        encodeBytesEntity = mergePair;
                    }
                }
                arrayList.add(mergePair);
            }
        }
        return encodeBytesEntity;
    }

    private EncodeBytesEntity[] merge(EncodeBytesEntity[] encodeBytesEntityArr, EncodeBytesEntity encodeBytesEntity) {
        EncodeBytesEntity[] encodeBytesEntityArr2 = new EncodeBytesEntity[encodeBytesEntityArr.length];
        int i = 0;
        int i2 = 0;
        while (i2 < encodeBytesEntityArr.length) {
            if (i2 >= encodeBytesEntityArr.length - 1) {
                int i3 = i;
                i++;
                encodeBytesEntityArr2[i3] = encodeBytesEntityArr[i2];
                i2++;
            } else if (mergePair(encodeBytesEntityArr[i2], encodeBytesEntityArr[i2 + 1]).equals(encodeBytesEntity)) {
                int i4 = i;
                i++;
                encodeBytesEntityArr2[i4] = encodeBytesEntity;
                i2 += 2;
            } else {
                int i5 = i;
                i++;
                encodeBytesEntityArr2[i5] = encodeBytesEntityArr[i2];
                i2++;
            }
        }
        return (EncodeBytesEntity[]) Arrays.copyOfRange(encodeBytesEntityArr2, 0, i);
    }

    private List<Integer> encodeChunk(String str) {
        EncodeBytesEntity lowestIndexBytePair;
        byte[] bytes = str.getBytes(StandardCharsets.UTF_8);
        EncodeBytesEntity[] encodeBytesEntityArr = new EncodeBytesEntity[bytes.length];
        int i = 0;
        for (byte b : bytes) {
            EncodeBytesEntity encodeBytesEntity = new EncodeBytesEntity(new byte[]{b});
            encodeBytesEntity.rank = mergeableRanks.get(encodeBytesEntity).intValue();
            int i2 = i;
            i++;
            encodeBytesEntityArr[i2] = encodeBytesEntity;
        }
        ArrayList arrayList = new ArrayList();
        if (encodeBytesEntityArr.length < 2) {
            for (EncodeBytesEntity encodeBytesEntity2 : encodeBytesEntityArr) {
                arrayList.add(Integer.valueOf(encodeBytesEntity2.rank));
            }
            return arrayList;
        }
        while (encodeBytesEntityArr.length >= 2 && (lowestIndexBytePair = getLowestIndexBytePair(encodeBytesEntityArr)) != null) {
            encodeBytesEntityArr = merge(encodeBytesEntityArr, lowestIndexBytePair);
        }
        for (EncodeBytesEntity encodeBytesEntity3 : encodeBytesEntityArr) {
            arrayList.add(Integer.valueOf(encodeBytesEntity3.rank));
        }
        return arrayList;
    }

    @Override // com.alibaba.dashscope.tokenizers.Tokenizer
    public List<Integer> encodeOrdinary(String str) {
        ArrayList arrayList = new ArrayList();
        Matcher matcher = Pattern.compile(PATTEN_STRING).matcher(str);
        while (matcher.find()) {
            arrayList.addAll(encodeChunk(matcher.group()));
        }
        return arrayList;
    }

    /* JADX WARN: Multi-variable type inference failed */
    private List<String> splitWithSpecial(String str) {
        List arrayList = new ArrayList();
        if (str.contains(SPECIAL_START) && str.contains(SPECIAL_END)) {
            arrayList = StringUtils.splitByStrings(str, specialTokens.keySet());
        } else {
            arrayList.add(str);
        }
        return arrayList;
    }

    /* JADX WARN: Multi-variable type inference failed */
    @Override // com.alibaba.dashscope.tokenizers.Tokenizer
    public List<Integer> encode(String str, String str2) throws NoSpecialTokenExists, UnSupportedSpecialTokenMode {
        Map linkedHashMap;
        if (str2 == null) {
            str2 = "all";
        }
        if ("all".equals(str2)) {
            linkedHashMap = specialTokens;
        } else if ("none".equals(str2)) {
            linkedHashMap = new LinkedHashMap();
        } else {
            if (!"none_raise".equals(str2)) {
                throw new UnSupportedSpecialTokenMode(String.format("UnSupport allowedSpecial: %s", str2));
            }
            linkedHashMap = new LinkedHashMap();
            boolean z = false;
            Iterator<String> it = specialTokens.keySet().iterator();
            while (true) {
                if (!it.hasNext()) {
                    break;
                }
                if (str.indexOf(it.next()) != -1) {
                    z = true;
                    break;
                }
            }
            if (!z) {
                throw new NoSpecialTokenExists(String.format("No special token in %s", str));
            }
        }
        if (linkedHashMap.isEmpty()) {
            return encodeOrdinary(str);
        }
        List<String> splitWithSpecial = splitWithSpecial(str);
        ArrayList arrayList = new ArrayList();
        for (String str3 : splitWithSpecial) {
            if (linkedHashMap.containsKey(str3)) {
                arrayList.add(linkedHashMap.get(str3));
            } else {
                arrayList.addAll(encodeOrdinary(str3));
            }
        }
        return arrayList;
    }

    @Override // com.alibaba.dashscope.tokenizers.Tokenizer
    public String decode(List<Integer> list) {
        StringBuilder sb = new StringBuilder();
        Iterator<Integer> it = list.iterator();
        while (it.hasNext()) {
            sb.append(new String(decodeMap[it.next().intValue()], StandardCharsets.UTF_8));
        }
        return sb.toString();
    }

    /* JADX WARN: Type inference failed for: r0v28, types: [byte[], byte[][]] */
    static {
        $assertionsDisabled = !QwenTokenizer.class.desiredAssertionStatus();
        LinkedHashMap linkedHashMap = new LinkedHashMap();
        int i = SPECIAL_START_ID + 1;
        linkedHashMap.put(ENDOFTEXT, Integer.valueOf(SPECIAL_START_ID));
        int i2 = i + 1;
        linkedHashMap.put(IMSTART, Integer.valueOf(i));
        int i3 = i2 + 1;
        linkedHashMap.put(IMEND, Integer.valueOf(i2));
        for (int i4 = 0; i4 < 205; i4++) {
            int i5 = i3;
            i3++;
            linkedHashMap.put(String.format("<|extra_%d|>", Integer.valueOf(i4)), Integer.valueOf(i5));
        }
        specialTokens = Collections.unmodifiableMap(linkedHashMap);
        mergeableRanks = new LinkedHashMap();
        try {
            BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(QwenTokenizer.class.getClassLoader().getResourceAsStream(vocabularyBpeFile), StandardCharsets.UTF_8));
            while (true) {
                String readLine = bufferedReader.readLine();
                if (readLine == null) {
                    decodeMap = new byte[mergeableRanks.size() + specialTokens.size()];
                    for (Map.Entry<EncodeBytesEntity, Integer> entry : mergeableRanks.entrySet()) {
                        decodeMap[entry.getValue().intValue()] = Arrays.copyOf(entry.getKey().bytes, entry.getKey().bytes.length);
                    }
                    for (Map.Entry<String, Integer> entry2 : specialTokens.entrySet()) {
                        byte[] bytes = entry2.getKey().getBytes(StandardCharsets.UTF_8);
                        decodeMap[entry2.getValue().intValue()] = Arrays.copyOf(bytes, bytes.length);
                    }
                    return;
                }
                String[] split = readLine.split(TOKEN_RANK_SEPARATOR);
                if (!$assertionsDisabled && split.length != 2) {
                    throw new AssertionError("Invalid line in qwen.tiktoken: " + readLine);
                }
                byte[] decode = Base64.getDecoder().decode(split[0].getBytes(StandardCharsets.UTF_8));
                int intValue = Integer.valueOf(split[1]).intValue();
                mergeableRanks.put(new EncodeBytesEntity(decode, intValue), Integer.valueOf(intValue));
            }
        } catch (IOException e) {
            throw new RuntimeException("Could not load qwen.tiktoken from resources", e);
        }
    }
}
