![[personal profile]](https://www.dreamwidth.org/img/silk/identity/user.png)
![[community profile]](https://www.dreamwidth.org/img/silk/identity/community.png)
// UTF-32 is a Unicode encoding that uses exactly four bytes for each
// codepoint. Unlike UTF-8 (used widely on the Web) or UTF-16 (used in Java
// and .NET), each codepoint / character takes up the same number of bytes,
// making it much easier to do string processing (counting, substrings) based
// on character count.
// One common application is applying styling or custom behavior to Twitter
// posts based on the data provided by the API:
// https://developer.twitter.com/en/docs/twitter-api/v1/data-dictionary/object-model/entities
// Copyright 2022 Isaac Schemm
// Permission is hereby granted, free of charge, to any person obtaining a copy of this
// software and associated documentation files (the "Software"), to deal in the Software
// without restriction, including without limitation the rights to use, copy, modify,
// merge, publish, distribute, sublicense, and/or sell copies of the Software, and to
// permit persons to whom the Software is furnished to do so.
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
// INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
// PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
// OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
// SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
namespace ISchemm.UTF32
open System
open System.Collections
open System.Collections.Generic
open System.Runtime.InteropServices
open System.Text
open Microsoft.FSharp.NativeInterop
// The type Char32 is used to store a single Unicode character. Like any F#
// record, structural equality and comparison is implemented by the compiler.
// By compiling the Char32 record as a struct (instead of a class), we can
// allow client code (particularly in C#) to use the Char32 type in raw
// ("unsafe") memory pointers. The size of the struct will be 4 bytes, the
// same as the type of its one underlying field (a 32-bit integer).
[<Struct>]
type Char32 = {
    // The Unicode codepoint, stored as a plain 32-bit integer. As a record
    // field it must be supplied at construction time and cannot be changed
    // afterwards; to "modify" a Char32, build a new one. This keeps the type
    // close in spirit to built-in primitives such as char and int.
    Value: int32
} with
    // The UTF-32 encoding used for all conversions in this library. The byte
    // order follows the machine's native order (big-endian only when
    // BitConverter reports a big-endian system), so that reinterpreting a
    // Char32* as an int* exposes the codepoints directly. No byte order mark
    // is emitted. Note that this property builds a new encoding object every
    // time it is read.
    static member Encoding =
        let bigEndian = not BitConverter.IsLittleEndian
        new UTF32Encoding (bigEndian, false)

    // Factory alternative to the record constructor syntax; wraps the given
    // codepoint value in a Char32.
    static member FromInt32 v = { Value = v }

    // Like the built-in char type, the string form of a Char32 is a string
    // holding just that character. Because .NET strings are UTF-16, the
    // result may be two chars long (a surrogate pair) for a single codepoint.
    override this.ToString () =
        // The codepoint's bytes in native order, which is the same order
        // that Char32.Encoding is configured to read.
        let nativeBytes = BitConverter.GetBytes (this.Value)
        Char32.Encoding.GetString nativeBytes
// Char32Array provides a set of static functions for converting to and from
// arrays of Char32 objects.
module Char32Array =
    // Reinterprets a byte array holding UTF-32 data (native byte order) as
    // an array of Char32 values. The bytes are copied, not aliased.
    // Raises FormatException when the byte count is not a whole number of
    // Char32 values.
    let FromByteArray (bArr: byte[]): Char32[] =
        let charSize = sizeof<Char32>
        // Every UTF-32 codepoint occupies exactly sizeof<Char32> bytes, so a
        // trailing partial codepoint means the input is malformed.
        if bArr.Length % charSize <> 0 then
            raise (FormatException $"Length of array must be divisible by {sizeof<Char32>}")
        match bArr.Length with
        | 0 ->
            // Nothing to copy; also avoids pinning an empty array.
            [||]
        | byteCount ->
            // Destination buffer, one Char32 per group of input bytes.
            let chars = Array.zeroCreate<Char32> (byteCount / charSize)
            // Pin the destination so the garbage collector cannot move it
            // while its address is in use, then copy the raw bytes straight
            // into it. The pin is released when "pin" goes out of scope.
            use pin = fixed chars
            Marshal.Copy (bArr, 0, NativePtr.toNativeInt pin, byteCount)
            chars

    // Converts a .NET (UTF-16) string to a Char32 array: the string is
    // encoded to UTF-32 bytes with Char32.Encoding and those bytes are then
    // handed to FromByteArray.
    let FromString (str: string): Char32[] =
        FromByteArray (Char32.Encoding.GetBytes str)

    // Copies the raw contents of a Char32 array into a new byte array
    // (sizeof<Char32> bytes per element, native byte order).
    let GetByteArray (cArr: Char32[]): byte[] =
        match cArr.Length with
        | 0 ->
            // Nothing to copy; also avoids pinning an empty array.
            [||]
        | charCount ->
            // Destination buffer sized to hold every element's bytes.
            let bytes = Array.zeroCreate<byte> (charCount * sizeof<Char32>)
            // Pin the source array and copy out of it, mirroring the
            // approach used in FromByteArray.
            use pin = fixed cArr
            Marshal.Copy (NativePtr.toNativeInt pin, bytes, 0, bytes.Length)
            bytes

    // Converts a Char32 array to a .NET (UTF-16) string: the array is
    // flattened to bytes with GetByteArray and decoded with Char32.Encoding.
    let GetString (array: Char32[]): string =
        Char32.Encoding.GetString (GetByteArray array)
// The String32 type is an immutable string of Char32 objects, intended to
// mirror native .NET strings (at least in the functionality that's been
// implemented here.) The version of this code on NuGet made this type a
// struct, but I don't think this is necessary; unlike Char32, its sole field
// is not a value type.
type String32 = {
    // The characters of the string, in order. An immutable F# list gives the
    // record structural equality and comparison for free and guarantees the
    // String32 cannot be changed after construction.
    List: Char32 list
} with
    // Wraps an F# list of Char32 values in a String32.
    static member FromList (list: Char32 list) : String32 = { List = list }

    // Builds a String32 from any IEnumerable<Char32> (array, .NET list, ...).
    // Intended for C# / VB.NET callers, who would otherwise have to reach
    // for F#'s ListModule themselves.
    static member FromEnumerable (src: seq<Char32>) =
        String32.FromList (List.ofSeq src)

    // Counterpart of FromEnumerable. The String32 itself is returned, viewed
    // as a sequence, because the type implements IEnumerable<Char32>.
    static member ToEnumerable (src: String32) : seq<Char32> = upcast src

    // Converts a .NET string to a String32 by way of
    // Char32Array.FromString and FromEnumerable.
    static member FromString (str: string) =
        String32.FromEnumerable (Char32Array.FromString str)

    // Converts a String32 to a .NET string: the underlying list is copied
    // into an array, which Char32Array.GetString then decodes.
    static member ToString (src: String32) =
        Char32Array.GetString (List.toArray src.List)

    // A String32 with no characters.
    static member Empty = { List = List.empty }

    // Number of characters (codepoints) in the string, i.e. the length of
    // the underlying list.
    member this.Length = this.List.Length

    // Returns the character at the given position in the underlying list.
    member this.Item index = this.List.[index]

    // Backs F# slice syntax (str.[a..b]); the bounds are forwarded to the
    // underlying list's own slicing and the result is wrapped again.
    member this.GetSlice (startIndex, endIndex) =
        { List = this.List.GetSlice (startIndex, endIndex) }

    // Conventional substring methods layered on top of the slice support:
    // everything from startIndex onward...
    member this.Substring (startIndex: int) = this.[startIndex ..]

    // ...or "length" characters beginning at startIndex.
    member this.Substring (startIndex: int, length: int) =
        let lastIndex = startIndex + length - 1
        this.[startIndex .. lastIndex]

    // The string representation is the same text as a .NET (UTF-16) string.
    override this.ToString () = String32.ToString this

    // IReadOnlyList<Char32> (and, through it, IEnumerable<Char32> and the
    // non-generic IEnumerable). Every member delegates to the underlying
    // list.
    interface IReadOnlyList<Char32> with
        member this.Count: int = this.Length

        member this.GetEnumerator(): IEnumerator =
            let untyped = this.List :> IEnumerable
            untyped.GetEnumerator()

        member this.GetEnumerator(): IEnumerator<Char32> =
            let typed = this.List :> seq<Char32>
            typed.GetEnumerator()

        member this.Item with get (index: int): Char32 = this.List.[index]
// The String32Replacement module provides functions that can help process
// strings from the Twitter API (or similar) using the entity objects it gives
// you (with offsets that point to certain characters in the string).
module String32Replacement =
    // ISegment describes one edit to a string (the string itself is not
    // part of the segment): the characters from StartIndex up to, but not
    // including, EndIndex are to be replaced by ReplacementValue. Because
    // this is an interface, client code can implement it on a class it
    // already has, or create a new one.
    type ISegment =
        // Position of the first character covered by the segment.
        abstract member StartIndex: int
        // Position just past the last character covered by the segment;
        // the character at EndIndex itself is kept.
        abstract member EndIndex: int
        // What the covered characters are replaced with. This will usually
        // differ in length from the text it replaces, which is why segments
        // are applied from the end of the string backwards: earlier indexes
        // stay valid while later parts of the string change.
        abstract member ReplacementValue: String32

    // Creates an ISegment that simply removes the characters from a up to
    // (not including) b. Private; used only by DisplayRange below.
    let private Omit a b = {
        new ISegment with
            member __.StartIndex = a
            member __.EndIndex = b
            member __.ReplacementValue = String32.Empty
    }

    // Produces two ISegments that together trim a string down to the range
    // displayStartIndex (inclusive) .. displayEndIndex (exclusive). The
    // sequence expression has no explicit "yield", so each expression in it
    // is yielded implicitly.
    let DisplayRange (displayStartIndex: int, displayEndIndex: int) = seq {
        // Removes everything before displayStartIndex.
        Omit 0 displayStartIndex
        // Removes everything from displayEndIndex onward; Int32.MaxValue is
        // clamped to the actual string length when the segment is applied.
        Omit displayEndIndex System.Int32.MaxValue
    }

    // Applies the given segments to str and returns the resulting pieces as
    // a sequence of String32 values in REVERSE order (last piece first).
    // "remaining" must already be ordered so that segments nearest the end
    // of the string come first. Private; used only by Replace below.
    let rec private Apply (remaining: ISegment list) (str: String32) = seq {
        match remaining with
        | [] ->
            // No segments left: whatever remains of the string is kept.
            yield str
        | e::tail ->
            // Clamp the segment to the string that is actually left. The
            // end is forced into 0 .. str.Length, and the start into
            // 0 .. end, so that a malformed segment (start after end, or a
            // negative end) degrades to an insertion at the clamped end
            // instead of emitting some characters twice.
            let endIndex = e.EndIndex |> max 0 |> min str.Length
            let startIndex = e.StartIndex |> max 0 |> min endIndex
            // The untouched text that follows this segment.
            yield str.[endIndex ..]
            // The replacement text for the segment itself.
            yield e.ReplacementValue
            // Everything before the segment is handled by the remaining
            // segments; their pieces are appended to this sequence.
            yield! Apply tail str.[.. startIndex - 1]
    }

    // Applies the changes described by the given ISegment objects to str and
    // returns the result as a new String32. The segments may be supplied in
    // any order.
    let Replace (segments: seq<ISegment>) (str: String32) =
        // Order the segments so those nearest the end of the string come
        // first. StartIndex breaks ties between segments that share an
        // EndIndex, so the outcome does not depend on the order in which the
        // caller happened to supply them.
        let ordered =
            segments
            |> Seq.sortByDescending (fun x -> x.EndIndex, x.StartIndex)
            |> Seq.toList
        // Apply produces the pieces back-to-front...
        Apply ordered str
        // ...so put them back into reading order,
        |> Seq.rev
        // flatten them (each String32 is itself a seq<Char32>),
        |> Seq.collect id
        // and build the final String32 from the characters.
        |> String32.FromEnumerable