blob: 42acfbdcea09239fb2684836116362d2a9491bdf [file] [log] [blame]
//===-- sanitizer_lzw.h -----------------------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Lempel–Ziv–Welch encoding/decoding
//
//===----------------------------------------------------------------------===//
#ifndef SANITIZER_LZW_H
#define SANITIZER_LZW_H
#include "sanitizer_dense_map.h"
namespace __sanitizer {
using LzwCodeType = u32;
template <class T, class ItIn, class ItOut>
ItOut LzwEncode(ItIn begin, ItIn end, ItOut out) {
using Substring =
detail::DenseMapPair<LzwCodeType /* Prefix */, T /* Next input */>;
// Sentinel value for substrings of len 1.
static constexpr LzwCodeType kNoPrefix =
Min(DenseMapInfo<Substring>::getEmptyKey().first,
DenseMapInfo<Substring>::getTombstoneKey().first) -
1;
DenseMap<Substring, LzwCodeType> prefix_to_code;
{
// Add all substring of len 1 as initial dictionary.
InternalMmapVector<T> dict_len1;
for (auto it = begin; it != end; ++it)
if (prefix_to_code.try_emplace({kNoPrefix, *it}, 0).second)
dict_len1.push_back(*it);
// Slightly helps with later delta encoding.
Sort(dict_len1.data(), dict_len1.size());
// For large sizeof(T) we have to store dict_len1. Smaller types like u8 can
// just generate them.
*out = dict_len1.size();
++out;
for (uptr i = 0; i != dict_len1.size(); ++i) {
// Remap after the Sort.
prefix_to_code[{kNoPrefix, dict_len1[i]}] = i;
*out = dict_len1[i];
++out;
}
CHECK_EQ(prefix_to_code.size(), dict_len1.size());
}
if (begin == end)
return out;
// Main LZW encoding loop.
LzwCodeType match = prefix_to_code.find({kNoPrefix, *begin})->second;
++begin;
for (auto it = begin; it != end; ++it) {
// Extend match with the new item.
auto ins = prefix_to_code.try_emplace({match, *it}, prefix_to_code.size());
if (ins.second) {
// This is a new substring, but emit the code for the current match
// (before extend). This allows LZW decoder to recover the dictionary.
*out = match;
++out;
// Reset the match to a single item, which must be already in the map.
match = prefix_to_code.find({kNoPrefix, *it})->second;
} else {
// Already known, use as the current match.
match = ins.first->second;
}
}
*out = match;
++out;
return out;
}
template <class T, class ItIn, class ItOut>
ItOut LzwDecode(ItIn begin, ItIn end, ItOut out) {
if (begin == end)
return out;
// Load dictionary of len 1 substrings. Theses correspont to lowest codes.
InternalMmapVector<T> dict_len1(*begin);
++begin;
if (begin == end)
return out;
for (auto& v : dict_len1) {
v = *begin;
++begin;
}
// Substrings of len 2 and up. Indexes are shifted because [0,
// dict_len1.size()) stored in dict_len1. Substings get here after being
// emitted to the output, so we can use output position.
InternalMmapVector<detail::DenseMapPair<ItOut /* begin. */, ItOut /* end */>>
code_to_substr;
// Copies already emitted substrings into the output again.
auto copy = [&code_to_substr, &dict_len1](LzwCodeType code, ItOut out) {
if (code < dict_len1.size()) {
*out = dict_len1[code];
++out;
return out;
}
const auto& s = code_to_substr[code - dict_len1.size()];
for (ItOut it = s.first; it != s.second; ++it, ++out) *out = *it;
return out;
};
// Returns lens of the substring with the given code.
auto code_to_len = [&code_to_substr, &dict_len1](LzwCodeType code) -> uptr {
if (code < dict_len1.size())
return 1;
const auto& s = code_to_substr[code - dict_len1.size()];
return s.second - s.first;
};
// Main LZW decoding loop.
LzwCodeType prev_code = *begin;
++begin;
out = copy(prev_code, out);
for (auto it = begin; it != end; ++it) {
LzwCodeType code = *it;
auto start = out;
if (code == dict_len1.size() + code_to_substr.size()) {
// Special LZW case. The code is not in the dictionary yet. This is
// possible only when the new substring is the same as previous one plus
// the first item of the previous substring. We can emit that in two
// steps.
out = copy(prev_code, out);
*out = *start;
++out;
} else {
out = copy(code, out);
}
// Every time encoded emits the code, it also creates substing of len + 1
// including the first item of the just emmited substring. Do the same here.
uptr len = code_to_len(prev_code);
code_to_substr.push_back({start - len, start + 1});
prev_code = code;
}
return out;
}
} // namespace __sanitizer
#endif