Skip to content

Commit d433ce5

Browse files
cushon and minborg
committed
8369564: Provide a MemorySegment API to read strings with known lengths
Co-authored-by: Per Minborg <pminborg@openjdk.org> Reviewed-by: jvernee, mcimadamore
1 parent 556bddf commit d433ce5

10 files changed

Lines changed: 523 additions & 55 deletions

File tree

src/java.base/share/classes/java/lang/String.java

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2045,19 +2045,26 @@ public byte[] getBytes() {
20452045
return encode(Charset.defaultCharset(), coder(), value);
20462046
}
20472047

2048-
boolean bytesCompatible(Charset charset) {
2048+
boolean bytesCompatible(Charset charset, int srcIndex, int numChars) {
20492049
if (isLatin1()) {
20502050
if (charset == ISO_8859_1.INSTANCE) {
20512051
return true; // ok, same encoding
20522052
} else if (charset == UTF_8.INSTANCE || charset == US_ASCII.INSTANCE) {
2053-
return !StringCoding.hasNegatives(value, 0, value.length); // ok, if ASCII-compatible
2053+
return !StringCoding.hasNegatives(value, srcIndex, numChars); // ok, if ASCII-compatible
20542054
}
20552055
}
20562056
return false;
20572057
}
20582058

2059-
void copyToSegmentRaw(MemorySegment segment, long offset) {
2060-
MemorySegment.copy(value, 0, segment, ValueLayout.JAVA_BYTE, offset, value.length);
2059+
void copyToSegmentRaw(MemorySegment segment, long offset, int srcIndex, int srcLength) {
2060+
if (!isLatin1()) {
2061+
// This method is intended to be used together with bytesCompatible, which currently only supports
2062+
// latin1 strings. In the future, bytesCompatible could be updated to handle more cases, like
2063+
// UTF-16 strings (when the platform and charset endianness match, and the String doesn’t contain
2064+
// unpaired surrogates). If that happens, copyToSegmentRaw should also be updated.
2065+
throw new IllegalStateException("This string does not support copyToSegmentRaw");
2066+
}
2067+
MemorySegment.copy(value, srcIndex, segment, ValueLayout.JAVA_BYTE, offset, srcLength);
20612068
}
20622069

20632070
/**

src/java.base/share/classes/java/lang/System.java

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2331,13 +2331,13 @@ public String getLoaderNameID(ClassLoader loader) {
23312331
}
23322332

23332333
@Override
2334-
public void copyToSegmentRaw(String string, MemorySegment segment, long offset) {
2335-
string.copyToSegmentRaw(segment, offset);
2334+
public void copyToSegmentRaw(String string, MemorySegment segment, long offset, int srcIndex, int srcLength) {
2335+
string.copyToSegmentRaw(segment, offset, srcIndex, srcLength);
23362336
}
23372337

23382338
@Override
2339-
public boolean bytesCompatible(String string, Charset charset) {
2340-
return string.bytesCompatible(charset);
2339+
public boolean bytesCompatible(String string, Charset charset, int srcIndex, int numChars) {
2340+
return string.bytesCompatible(charset, srcIndex, numChars);
23412341
}
23422342
});
23432343
}

src/java.base/share/classes/java/lang/foreign/MemorySegment.java

Lines changed: 81 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1296,12 +1296,7 @@ MemorySegment reinterpret(long newSize,
12961296
* over the decoding process is required.
12971297
* <p>
12981298
* Getting a string from a segment with a known byte offset and
1299-
* known byte length can be done like so:
1300-
* {@snippet lang=java :
1301-
* byte[] bytes = new byte[length];
1302-
* MemorySegment.copy(segment, JAVA_BYTE, offset, bytes, 0, length);
1303-
* return new String(bytes, charset);
1304-
* }
1299+
* known byte length can be done using {@link #getString(long, Charset, long)}.
13051300
*
13061301
* @param offset offset in bytes (relative to this segment address) at which this
13071302
* access operation will occur
@@ -1328,6 +1323,40 @@ MemorySegment reinterpret(long newSize,
13281323
*/
13291324
String getString(long offset, Charset charset);
13301325

1326+
/**
1327+
* Reads a string from this segment at the given offset, using the provided length
1328+
* and charset.
1329+
* <p>
1330+
* This method always replaces malformed-input and unmappable-character
1331+
* sequences with this charset's default replacement string. The {@link
1332+
* java.nio.charset.CharsetDecoder} class should be used when more control
1333+
* over the decoding process is required.
1334+
* <p>
1335+
* If the string contains any {@code '\0'} characters, they will be read as well.
1336+
* This differs from {@link #getString(long, Charset)}, which will only read up
1337+
* to the first {@code '\0'}, resulting in truncation for string data that contains
1338+
* the {@code '\0'} character.
1339+
*
1340+
* @param offset offset in bytes (relative to this segment address) at which this
1341+
* access operation will occur
1342+
* @param charset the charset used to {@linkplain Charset#newDecoder() decode} the
1343+
* string bytes
1344+
* @param byteLength length, in bytes, of the region of memory to read and decode into
1345+
* a string
1346+
* @return a Java string constructed from the bytes read from the given starting
1347+
* address up to the given length
1348+
* @throws IllegalArgumentException if the size of the string is greater than the
1349+
* largest string supported by the platform
1350+
* @throws IndexOutOfBoundsException if {@code offset < 0}
1351+
* @throws IndexOutOfBoundsException if {@code offset > byteSize() - byteLength}
1352+
* @throws IllegalStateException if the {@linkplain #scope() scope} associated with
1353+
* this segment is not {@linkplain Scope#isAlive() alive}
1354+
* @throws WrongThreadException if this method is called from a thread {@code T},
1355+
* such that {@code isAccessibleBy(T) == false}
1356+
* @throws IllegalArgumentException if {@code byteLength < 0}
1357+
*/
1358+
String getString(long offset, Charset charset, long byteLength);
1359+
13311360
/**
13321361
* Writes the given string into this segment at the given offset, converting it to
13331362
* a null-terminated byte sequence using the {@linkplain StandardCharsets#UTF_8 UTF-8}
@@ -1366,7 +1395,8 @@ MemorySegment reinterpret(long newSize,
13661395
* If the given string contains any {@code '\0'} characters, they will be
13671396
* copied as well. This means that, depending on the method used to read
13681397
* the string, such as {@link MemorySegment#getString(long)}, the string
1369-
* will appear truncated when read again.
1398+
* will appear truncated when read again. The string can be read without
1399+
* truncation using {@link #getString(long, Charset, long)}.
13701400
*
13711401
* @param offset offset in bytes (relative to this segment address) at which this
13721402
* access operation will occur, the final address of this write
@@ -2606,6 +2636,50 @@ static void copy(Object srcArray, int srcIndex,
26062636
elementCount);
26072637
}
26082638

2639+
/**
2640+
* Copies the byte sequence of the given string encoded using the provided charset
2641+
* to the destination segment.
2642+
* <p>
2643+
* This method always replaces malformed-input and unmappable-character
2644+
* sequences with this charset's default replacement byte array. The {@link
2645+
* java.nio.charset.CharsetEncoder} class should be used when more control
2646+
* over the encoding process is required.
2647+
* <p>
2648+
* If the given string contains any {@code '\0'} characters, they will be
2649+
* copied as well. This means that, depending on the method used to read
2650+
* the string, such as {@link MemorySegment#getString(long)}, the string
2651+
* will appear truncated when read again. The string can be read without
2652+
* truncation using {@link #getString(long, Charset, long)}.
2653+
*
2654+
* @param src the Java string to be written into the destination segment
2655+
* @param dstEncoding the charset used to {@linkplain Charset#newEncoder() encode}
2656+
* the string bytes.
2657+
* @param srcIndex the starting character index of the source string
2658+
* @param dst the destination segment
2659+
* @param dstOffset the starting offset, in bytes, of the destination segment
2660+
* @param numChars the number of characters to be copied
2661+
* @throws IllegalStateException if the {@linkplain #scope() scope} associated with
2662+
* {@code dst} is not {@linkplain Scope#isAlive() alive}
2663+
* @throws WrongThreadException if this method is called from a thread {@code T},
2664+
* such that {@code dst.isAccessibleBy(T) == false}
2665+
* @throws IndexOutOfBoundsException if any of {@code srcIndex}, {@code numChars}, or {@code dstOffset}
2666+
* is {@code < 0}
2667+
* @throws IndexOutOfBoundsException if {@code srcIndex > src.length() - numChars}
2668+
* @throws IllegalArgumentException if {@code dst} is {@linkplain #isReadOnly() read-only}
2669+
* @throws IndexOutOfBoundsException if {@code dstOffset > dst.byteSize() - B} where {@code B} is the size,
2670+
* in bytes, of the substring of {@code src} encoded using the given charset
2671+
* @return the number of copied bytes.
2672+
*/
2673+
@ForceInline
2674+
static long copy(String src, Charset dstEncoding, int srcIndex, MemorySegment dst, long dstOffset, int numChars) {
2675+
Objects.requireNonNull(src);
2676+
Objects.requireNonNull(dstEncoding);
2677+
Objects.requireNonNull(dst);
2678+
Objects.checkFromIndexSize(srcIndex, numChars, src.length());
2679+
2680+
return AbstractMemorySegmentImpl.copy(src, dstEncoding, srcIndex, dst, dstOffset, numChars);
2681+
}
2682+
26092683
/**
26102684
* Finds and returns the relative offset, in bytes, of the first mismatch between the
26112685
* source and the destination segments. More specifically, the bytes at offset

src/java.base/share/classes/java/lang/foreign/SegmentAllocator.java

Lines changed: 51 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -111,7 +111,8 @@ default MemorySegment allocateFrom(String str) {
111111
* If the given string contains any {@code '\0'} characters, they will be
112112
* copied as well. This means that, depending on the method used to read
113113
* the string, such as {@link MemorySegment#getString(long)}, the string
114-
* will appear truncated when read again.
114+
* will appear truncated when read again. The string can be read without
115+
* truncation using {@link MemorySegment#getString(long, Charset, long)}.
115116
*
116117
* @param str the Java string to be converted into a C string
117118
* @param charset the charset used to {@linkplain Charset#newEncoder() encode} the
@@ -137,10 +138,10 @@ default MemorySegment allocateFrom(String str, Charset charset) {
137138
int termCharSize = StringSupport.CharsetKind.of(charset).terminatorCharSize();
138139
MemorySegment segment;
139140
int length;
140-
if (StringSupport.bytesCompatible(str, charset)) {
141+
if (StringSupport.bytesCompatible(str, charset, 0, str.length())) {
141142
length = str.length();
142143
segment = allocateNoInit((long) length + termCharSize);
143-
StringSupport.copyToSegmentRaw(str, segment, 0);
144+
StringSupport.copyToSegmentRaw(str, segment, 0, 0, str.length());
144145
} else {
145146
byte[] bytes = str.getBytes(charset);
146147
length = bytes.length;
@@ -153,6 +154,53 @@ default MemorySegment allocateFrom(String str, Charset charset) {
153154
return segment;
154155
}
155156

157+
/**
158+
* Encodes a range of a Java string using the provided charset and stores the resulting
159+
* byte array into a memory segment.
160+
* <p>
161+
* This method always replaces malformed-input and unmappable-character
162+
* sequences with this charset's default replacement byte array. The
163+
* {@link java.nio.charset.CharsetEncoder} class should be used when more
164+
* control over the encoding process is required.
165+
* <p>
166+
* If the given string contains any {@code '\0'} characters, they will be
167+
* copied as well. This means that, depending on the method used to read
168+
* the string, such as {@link MemorySegment#getString(long)}, the string
169+
* will appear truncated when read again. The string can be read without
170+
* truncation using {@link MemorySegment#getString(long, Charset, long)}.
171+
*
172+
* @param str the Java string to be encoded
173+
* @param charset the charset used to {@linkplain Charset#newEncoder() encode} the
174+
* string bytes
175+
* @param srcIndex the starting index of the source string
176+
* @param numChars the number of characters to be copied
177+
* @return a new native segment containing the encoded string
178+
* @throws IndexOutOfBoundsException if either {@code srcIndex} or {@code numChars} is {@code < 0}
179+
* @throws IndexOutOfBoundsException if {@code srcIndex > str.length() - numChars}
180+
*
181+
* @implSpec The default implementation for this method copies the selected range of the
182+
* provided Java string into a new memory segment obtained by calling
183+
* {@code this.allocate(B)}, where {@code B} is the size, in bytes, of
184+
* that range encoded using the provided charset (e.g.
185+
* {@code str.substring(srcIndex, srcIndex + numChars).getBytes(charset).length}).
186+
*/
187+
@ForceInline
188+
default MemorySegment allocateFrom(String str, Charset charset, int srcIndex, int numChars) {
189+
Objects.requireNonNull(charset);
190+
Objects.requireNonNull(str);
191+
Objects.checkFromIndexSize(srcIndex, numChars, str.length());
192+
MemorySegment segment;
193+
if (StringSupport.bytesCompatible(str, charset, srcIndex, numChars)) {
194+
segment = allocateNoInit(numChars);
195+
StringSupport.copyToSegmentRaw(str, segment, 0, srcIndex, numChars);
196+
} else {
197+
byte[] bytes = str.substring(srcIndex, srcIndex + numChars).getBytes(charset);
198+
segment = allocateNoInit(bytes.length);
199+
MemorySegment.copy(bytes, 0, segment, ValueLayout.JAVA_BYTE, 0, bytes.length);
200+
}
201+
return segment;
202+
}
203+
156204
/**
157205
* {@return a new memory segment initialized with the provided byte value}
158206
* <p>

src/java.base/share/classes/jdk/internal/access/JavaLangAccess.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -634,10 +634,10 @@ StackWalker newStackWalkerInstance(Set<StackWalker.Option> options,
634634
/**
635635
* Copy the string bytes to an existing segment, avoiding intermediate copies.
636636
*/
637-
void copyToSegmentRaw(String string, MemorySegment segment, long offset);
637+
void copyToSegmentRaw(String string, MemorySegment segment, long offset, int srcIndex, int srcLength);
638638

639639
/**
640640
* Are the string bytes compatible with the given charset?
641641
*/
642-
boolean bytesCompatible(String string, Charset charset);
642+
boolean bytesCompatible(String string, Charset charset, int srcIndex, int numChars);
643643
}

src/java.base/share/classes/jdk/internal/foreign/AbstractMemorySegmentImpl.java

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -551,6 +551,13 @@ public boolean equals(Object o) {
551551
unsafeGetOffset() == that.unsafeGetOffset();
552552
}
553553

554+
@Override
555+
public String getString(long offset, Charset charset, long byteLength) {
556+
Utils.checkNonNegativeArgument(byteLength, "byteLength");
557+
Objects.requireNonNull(charset);
558+
return StringSupport.read(this, offset, charset, byteLength);
559+
}
560+
554561
@Override
555562
public int hashCode() {
556563
return Objects.hash(
@@ -702,6 +709,16 @@ public static void copy(Object srcArray, int srcIndex,
702709
}
703710
}
704711

712+
@ForceInline
713+
public static long copy(String src, Charset dstEncoding, int srcIndex, MemorySegment dst, long dstOffset, int numChars) {
714+
Objects.requireNonNull(src);
715+
Objects.requireNonNull(dstEncoding);
716+
Objects.requireNonNull(dst);
717+
718+
AbstractMemorySegmentImpl destImpl = (AbstractMemorySegmentImpl)dst;
719+
return StringSupport.copyBytes(src, destImpl, dstEncoding, dstOffset, srcIndex, numChars);
720+
}
721+
705722
// accessors
706723

707724
@ForceInline

0 commit comments

Comments
 (0)