From c99e2ccaff7e18e858b5b111ab9b4d78e543bc69 Mon Sep 17 00:00:00 2001 From: Marcin Kulik Date: Tue, 4 Jul 2017 13:41:43 +0200 Subject: [PATCH] Fix utf-8 handling in stdout_stream/2 --- lib/asciinema/asciicasts.ex | 14 +++++++------ lib/asciinema/string_utils.ex | 22 +++++++++++++++++++++ spec/fixtures/0.9.8/stdout-split | Bin 0 -> 54 bytes spec/fixtures/0.9.8/stdout-split.time | Bin 0 -> 62 bytes test/asciinema/asciicasts_test.exs | 7 +++++++ test/asciinema/string_utils_test.exs | 27 ++++++++++++++++++++++++++ 6 files changed, 64 insertions(+), 6 deletions(-) create mode 100644 lib/asciinema/string_utils.ex create mode 100644 spec/fixtures/0.9.8/stdout-split create mode 100644 spec/fixtures/0.9.8/stdout-split.time create mode 100644 test/asciinema/string_utils_test.exs diff --git a/lib/asciinema/asciicasts.ex b/lib/asciinema/asciicasts.ex index 628e598..23977d6 100644 --- a/lib/asciinema/asciicasts.ex +++ b/lib/asciinema/asciicasts.ex @@ -1,6 +1,6 @@ defmodule Asciinema.Asciicasts do import Ecto.Query, warn: false - alias Asciinema.{Repo, Asciicast, FileStore} + alias Asciinema.{Repo, Asciicast, FileStore, StringUtils} alias Asciinema.Asciicasts.PosterGenerator def get_asciicast!(id) when is_integer(id) do @@ -128,7 +128,8 @@ defmodule Asciinema.Asciicasts do defp open_stream_files(stdout_timing_path, stdout_data_path) do {open_stream_file(stdout_timing_path), - open_stream_file(stdout_data_path)} + open_stream_file(stdout_data_path), + ""} end defp open_stream_file(path) do @@ -146,13 +147,14 @@ defmodule Asciinema.Asciicasts do end end - defp generate_stream_elem({timing_file, data_file} = files) do + defp generate_stream_elem({timing_file, data_file, invalid_str} = files) do case IO.read(timing_file, :line) do line when is_binary(line) -> {delay, count} = parse_line(line) - case IO.read(data_file, count) do + case IO.binread(data_file, count) do text when is_binary(text) -> - {[{delay, text}], files} + {valid_str, invalid_str} = StringUtils.valid_part(invalid_str, text) + {[{delay, valid_str}], {timing_file, data_file, invalid_str}} otherwise -> {:error, otherwise} end @@ -161,7 +163,7 @@ defmodule Asciinema.Asciicasts do end end - defp close_stream_files({timing_file, data_file}) do + defp close_stream_files({timing_file, data_file, _}) do File.close(timing_file) File.close(data_file) end diff --git a/lib/asciinema/string_utils.ex b/lib/asciinema/string_utils.ex new file mode 100644 index 0000000..a456c49 --- /dev/null +++ b/lib/asciinema/string_utils.ex @@ -0,0 +1,22 @@ +defmodule Asciinema.StringUtils do + def valid_part(invalid_str, str) do + case String.chunk(invalid_str <> str, :valid) do + [] -> + {"", ""} + chunks -> + str = + chunks + |> Enum.take(Enum.count(chunks) - 1) + |> Enum.filter(&String.valid?/1) + |> Enum.join + + last = Enum.at(chunks, -1) + + if String.valid?(last) do + {str <> last, ""} + else + {str, last} + end + end + end +end diff --git a/spec/fixtures/0.9.8/stdout-split b/spec/fixtures/0.9.8/stdout-split new file mode 100644 index 0000000000000000000000000000000000000000..29ee40a8ea087ff8c7fcd89fab784d7cba9320ce GIT binary patch literal 54 zcmZ>Y%CIzaj8qGbbehiV%)r2S;0%KRgM&Z=2MZsA0)rB-l&urbjIM4Emco5{Pr3>g KDzxfJwE+Nb=?=#L literal 0 HcmV?d00001 diff --git a/spec/fixtures/0.9.8/stdout-split.time b/spec/fixtures/0.9.8/stdout-split.time new file mode 100644 index 0000000000000000000000000000000000000000..01c8c3ef0010ce848bd9d2f28a84a122dc995954 GIT binary patch literal 62 zcmV-E0Kxx4T4*^jL0KkKSyoAtS^xkC*Z>d^KmmV%AOJA~K+qbVjV7Lmw*0j!X2@_{ UDVhvm(6i$1NT&)C3du5SKyR-WV*mgE literal 0 HcmV?d00001 diff --git a/test/asciinema/asciicasts_test.exs b/test/asciinema/asciicasts_test.exs index 276557c..c2ff994 100644 --- a/test/asciinema/asciicasts_test.exs +++ b/test/asciinema/asciicasts_test.exs @@ -131,5 +131,12 @@ defmodule Asciinema.AsciicastsTest do assert :ok == Stream.run(stream) assert [{1.234567, "foobar"}, {0.123456, "baz"}] == Enum.take(stream, 2) end + + test "with bzipped files (utf-8 sequence split between frames)" do + stream = Asciicasts.stdout_stream("spec/fixtures/0.9.8/stdout-split.time", + "spec/fixtures/0.9.8/stdout-split") + assert :ok == Stream.run(stream) + assert [{1.234567, "xxżó"}, {0.123456, "łć"}, {2.0, "xx"}] == Enum.take(stream, 3) + end end end diff --git a/test/asciinema/string_utils_test.exs b/test/asciinema/string_utils_test.exs new file mode 100644 index 0000000..7e45e66 --- /dev/null +++ b/test/asciinema/string_utils_test.exs @@ -0,0 +1,27 @@ +defmodule Asciinema.StringUtilsTest do + use ExUnit.Case + + describe "valid_part/2" do + import Asciinema.StringUtils, only: [valid_part: 2] + + test "no accumulator, valid string" do + assert valid_part("", "foo") == {"foo", ""} + end + + test "no accumulator, partial utf-8 seq" do + assert valid_part("", <<0xc5>>) == {"", <<0xc5>>} + end + + test "no accumulator, valid string + partial utf-8 seq at the end" do + assert valid_part("", "foo" <> <<0xc5>>) == {"foo", <<0xc5>>} + end + + test "with accumulator, rest of utf-8 seq + valid string at the end" do + assert valid_part(<<0xc5>>, <<0x82>> <> "ćfoo") == {"łćfoo", ""} + end + + test "with accumulator, mixed valid/invalid string + partial utf-8 seq at the end" do + assert valid_part(<<0xc5>>, "x" <> <<0xc5, 0xc5>> <> "y" <> <<0xc5>>) == {"xy", <<0xc5>>} + end + end +end