isaacschemm: A cartoon of myself as a snail (snail8)
[personal profile] isaacschemm posting in [community profile] snailsharp
// UTF-32 is a Unicode encoding that uses exactly four bytes for each
// codepoint. Unlike UTF-8 (used widely on the Web) or UTF-16 (used in Java
// and .NET), each codepoint / character takes up the same number of bytes,
// making it much easier to do string processing (counting, substrings) based
// on character count.

// One common application is applying styling or custom behavior to Twitter
// posts based on the data provided by the API:
// https://developer.twitter.com/en/docs/twitter-api/v1/data-dictionary/object-model/entities
// Copyright 2022 Isaac Schemm

// Permission is hereby granted, free of charge, to any person obtaining a copy of this
// software and associated documentation files (the "Software"), to deal in the Software
// without restriction, including without limitation the rights to use, copy, modify,
// merge, publish, distribute, sublicense, and/or sell copies of the Software, and to
// permit persons to whom the Software is furnished to do so.

// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
// INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
// PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
// OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
// SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

namespace ISchemm.UTF32

open System
open System.Collections
open System.Collections.Generic
open System.Runtime.InteropServices
open System.Text
open Microsoft.FSharp.NativeInterop

// Char32 stores a single Unicode codepoint. Being an F# record, it gets
// structural equality and comparison from the compiler. It is compiled as a
// struct (not a class) so that client code (particularly in C#) can use
// Char32 through raw ("unsafe") memory pointers; the struct's only field is
// one 32-bit integer.
[<Struct>]
type Char32 = {
    // The codepoint itself. As a record field it must be supplied on
    // construction and cannot be reassigned afterwards; to "change" a
    // character, replace the whole Char32 with another one, just as you
    // would with a built-in primitive such as char or int.
    Value: int32
} with
    // The UTF-32 encoding used for all conversions. It is big-endian exactly
    // when the machine is not little-endian (i.e. it follows the system's
    // byte order) and writes no byte order mark. Matching the system's byte
    // order means that viewing a Char32* as an int* shows the codepoints
    // themselves. Note that every access to this property constructs a new
    // UTF32Encoding object.
    static member Encoding : UTF32Encoding =
        UTF32Encoding (bigEndian = not BitConverter.IsLittleEndian, byteOrderMark = false)

    // A named alternative to the record constructor, for call sites where
    // that conveys the intent more clearly.
    static member FromInt32 (v: int32) : Char32 = { Value = v }

    // Like the built-in char type, a Char32 prints as a string holding just
    // that character. .NET strings are UTF-16, so the result may consist of
    // two UTF-16 code units (a surrogate pair) rather than one.
    override this.ToString () =
        // The bytes come out of BitConverter in the system's byte order,
        // which is the same order Char32.Encoding is configured to read.
        let nativeBytes = BitConverter.GetBytes(this.Value)
        Char32.Encoding.GetString(nativeBytes)

// Char32Array provides a set of static functions for converting to and from
// arrays of Char32 objects.
module Char32Array =
    // Builds a Char32 array from a byte array of UTF-32 data. The bytes are
    // copied verbatim into the Char32 array's memory, so they must already be
    // in the system's byte order. Raises FormatException when the byte count
    // is not a multiple of the size of a Char32.
    let FromByteArray (bArr: byte[]): Char32[] =
        let charSize = sizeof<Char32>
        match bArr.Length with
        | byteCount when byteCount % charSize <> 0 ->
            // Every UTF-32 codepoint occupies the same number of bytes, so a
            // partial trailing codepoint means the input is malformed.
            raise (FormatException $"Length of array must be divisible by {sizeof<Char32>}")
        | 0 ->
            // Nothing to copy.
            [||]
        | byteCount ->
            // Destination array, one element per group of charSize bytes.
            let cArr: Char32[] = Array.zeroCreate (byteCount / charSize)

            // "use ... = fixed" yields a pointer to the first element of cArr
            // and keeps the garbage collector from moving the array while
            // the pointer is in scope; Marshal.Copy then copies the raw
            // bytes to that address.
            use cPtr = fixed cArr
            Marshal.Copy (bArr, 0, NativePtr.toNativeInt cPtr, byteCount)
            cArr

    // Converts a .NET (UTF-16) string to a Char32 array: the string is first
    // encoded to UTF-32 bytes with Char32.Encoding, and those bytes are then
    // handed to FromByteArray.
    let FromString (str: string): Char32[] =
        let utf32Bytes = Char32.Encoding.GetBytes(str)
        FromByteArray utf32Bytes

    // Produces the raw bytes of a Char32 array: sizeof<Char32> bytes per
    // element, copied verbatim from the array's memory (and therefore in the
    // system's byte order).
    let GetByteArray (cArr: Char32[]): byte[] =
        match cArr.Length with
        | 0 ->
            // Nothing to copy.
            [||]
        | charCount ->
            // Destination buffer sized to hold every element's bytes.
            let bArr: byte[] = Array.zeroCreate (charCount * sizeof<Char32>)

            // Pin cArr so the garbage collector cannot move it, then copy
            // its memory into bArr (the mirror image of FromByteArray).
            use cPtr = fixed cArr
            Marshal.Copy (NativePtr.toNativeInt cPtr, bArr, 0, bArr.Length)
            bArr

    // Converts a Char32 array to a .NET (UTF-16) string: the array's raw
    // bytes are obtained with GetByteArray and then decoded with
    // Char32.Encoding.
    let GetString (array: Char32[]): string =
        let utf32Bytes = GetByteArray array
        Char32.Encoding.GetString(utf32Bytes)

// String32 is an immutable string of Char32 values, meant to mirror native
// .NET strings (for the subset of functionality implemented here). The
// version of this code on NuGet made this type a struct, but that does not
// seem necessary: unlike Char32, its sole field is not a value type.
type String32 = {
    // The characters. Holding an F# list inside an F# record gives
    // structural equality and comparison, and keeps the object immutable.
    List: Char32 list
} with
    // Wraps an F# list of Char32 values in a String32.
    static member FromList (list: Char32 list) = { List = list }

    // Builds a String32 from any IEnumerable<Char32> (a .NET list, an array,
    // ...). Handy from C# or VB.NET, where reaching for F#'s ListModule
    // would not be an obvious thing to do.
    static member FromEnumerable (src: seq<Char32>) =
        String32.FromList (List.ofSeq src)

    // The counterpart of FromEnumerable. The String32 itself is returned,
    // viewed as a sequence, because String32 implements IEnumerable<Char32>
    // (see the interface implementation at the end of this type).
    static member ToEnumerable (src: String32) =
        src :> seq<Char32>

    // Converts a .NET string to a String32 by way of Char32Array.FromString
    // and FromEnumerable.
    static member FromString (str: string) =
        String32.FromEnumerable (Char32Array.FromString str)

    // Converts a String32 to a .NET string: the underlying list is copied
    // into an array, which Char32Array.GetString then decodes.
    static member ToString (src: String32) =
        Char32Array.GetString (List.toArray src.List)

    // A String32 with no characters.
    static member Empty = { List = [] }

    // The number of characters / codepoints, i.e. the length of the
    // underlying list.
    member this.Length = this.List.Length

    // Indexer: the character at the given position of the underlying list.
    member this.Item index = this.List.Item(index)

    // Enables F# slice syntax on a String32. The slice is taken from the
    // underlying list and wrapped in a new String32.
    member this.GetSlice (startIndex, endIndex) =
        String32.FromList (this.List.GetSlice (startIndex, endIndex))

    // Traditional substring methods, expressed through GetSlice: either
    // everything from startIndex onwards, or the range from startIndex
    // through startIndex + length - 1 (inclusive).
    member this.Substring (startIndex: int) =
        this.GetSlice (Some startIndex, None)
    member this.Substring (startIndex: int, length: int) =
        this.GetSlice (Some startIndex, Some (startIndex + length - 1))

    // The string representation of a String32 is the same text as a .NET
    // (UTF-16) string.
    override this.ToString () =
        Char32Array.GetString (List.toArray this.List)

    // String32 implements IReadOnlyList<Char32>, which brings IEnumerable
    // and IEnumerable<Char32> along with it. Count and Item go through the
    // members defined above; both enumerators come from the underlying list.
    interface IReadOnlyList<Char32> with
        member this.Count: int = this.Length
        member this.GetEnumerator(): IEnumerator =
            (this.List :> IEnumerable).GetEnumerator()
        member this.GetEnumerator(): IEnumerator<Char32> =
            (this.List :> IEnumerable<Char32>).GetEnumerator()
        member this.Item with get (index: int): Char32 = this.[index]

// The String32Replacement module provides functions that can help process
// strings from the Twitter API (or similar) using the entity objects it gives
// you (with offsets that point to certain characters in the string).
module String32Replacement =
    // An ISegment describes one edit to a string: a range of the string
    // (the string itself is not part of the segment) and the text that
    // should take that range's place. Because it is an interface, client
    // code can implement it on a class it already has or write a new one.
    type ISegment =
        // Index of the first character covered by the segment.
        abstract StartIndex: int

        // Index at which the segment stops. The Apply function in this
        // module keeps the characters from EndIndex onwards, so the
        // character at EndIndex is not part of the replaced range.
        abstract EndIndex: int

        // The text to put in place of the covered range. It will usually
        // differ in length from what it replaces, which is why segments are
        // processed from the end of the string backwards: that way an edit
        // never shifts the indexes of segments still waiting to be applied.
        abstract ReplacementValue: String32

    // Builds an ISegment whose replacement is the empty string, i.e. one
    // that deletes the given range. Private; DisplayRange below is its only
    // user.
    let private Omit startIndex endIndex =
        { new ISegment with
            member _.StartIndex = startIndex
            member _.EndIndex = endIndex
            member _.ReplacementValue = String32.Empty }

    // Produces the two ISegments that together trim a string down to the
    // range between displayStartIndex and displayEndIndex. The result is
    // built with a "seq" workflow, which creates an IEnumerable<T> on the
    // fly; each "yield" corresponds to "yield return" in a C# enumerator
    // function.
    let DisplayRange (displayStartIndex: int, displayEndIndex: int) = seq {
        // Deletes everything before displayStartIndex.
        yield Omit 0 displayStartIndex
        // Deletes everything from displayEndIndex to the end of the string
        // (Int32.MaxValue stands in for "the end", whatever the length).
        yield Omit displayEndIndex Int32.MaxValue
    }

    // Applies the given ISegments to the given string and returns the
    // resulting pieces as a sequence (IEnumerable) of strings. The pieces
    // come out back to front (the tail of the string first), so the caller
    // must reverse the sequence before joining it. The list of ISegments
    // must already be in reverse order (segment nearest the end of the
    // string first). Private; used only by the Replace function below.
    //
    // Each segment's indices are clamped so that
    // 0 <= startIndex <= endIndex <= str.Length. A segment that lies partly
    // or wholly outside the string, or whose StartIndex is greater than its
    // EndIndex, therefore degrades to a smaller (possibly empty) range
    // instead of slicing out of bounds or emitting some characters twice.
    let rec private Apply (remaining: ISegment list) (str: String32) = seq {
        // This function is recursive ("rec"): when the list is non-empty it
        // handles the first segment and then calls itself with the rest of
        // the list and the part of the string that precedes that segment.
        match remaining with
        | [] ->
            // No segments left; what remains of the string is kept as is.
            yield str
        | e::tail ->
            // The first segment in the list is the one nearest the end of
            // the string. Clamp the end into [0, Length] first, then the
            // start into [0, endIndex], so the suffix and prefix slices
            // taken below can never overlap.
            let endIndex = e.EndIndex |> max 0 |> min str.Length
            let startIndex = e.StartIndex |> max 0 |> min endIndex

            // The text that follows this segment.
            yield str.[endIndex ..]
            // The text that replaces this segment.
            yield e.ReplacementValue
            // Handle the text that precedes this segment with the remaining
            // segments, appending whatever that produces to the output.
            yield! Apply tail str.[.. startIndex - 1]
    }

    // Applies the given ISegments to the given string and returns the
    // result as a new String32. The segments may be supplied in any order.
    let Replace (segments: seq<ISegment>) (str: String32) =
        // Apply needs the segments as an F# list, ordered so that the ones
        // nearest the end of the string come first.
        let lastToFirst =
            segments
            |> Seq.sortByDescending (fun segment -> segment.EndIndex)
            |> List.ofSeq

        // Apply emits the pieces of the result back to front, so put them
        // into reading order.
        let pieces = Seq.rev (Apply lastToFirst str)

        // Each piece is itself a seq<Char32>; flatten them into one
        // sequence of characters and build the final String32 from it.
        pieces
        |> Seq.collect (fun piece -> String32.ToEnumerable piece)
        |> String32.FromEnumerable
Tags:
This account has disabled anonymous posting.
If you don't have an account you can create one now.
HTML doesn't work in the subject.
More info about formatting

Snail#

A programming blog where the gimmick is that I pretend to be a snail.

Expand Cut Tags

No cut tags

Style Credit

Page generated Jul. 6th, 2025 07:47 pm
Powered by Dreamwidth Studios