mirror of
https://anongit.gentoo.org/git/repo/gentoo.git
synced 2025-07-22 23:19:14 +02:00
Backport two supplemental patches from the devel branch so as to address some remaining defects affecting the revised read_mbchar() function. For further information, please refer to pitfall #65 at https://mywiki.wooledge.org/BashPitfalls, along with the commit messages of the patches themselves. The read builtin should now behave as well as it did in 4.4, and as well as it will in the upcoming 5.3-rc2 release. Whitelist the Contour and Ghostty terminal emulators for colour support. In the case that colour support is detected, refrain from defining any aliases for which either an alias or function is found already to exist by the same name. The motivation for making this change stems from a discussion at https://forums.gentoo.org/viewtopic-t-1170323.html. Define COLORTERM=1 in the environment of dircolors(1). The methods by which dircolors determines whether colour is supported are inferior to those of Gentoo. Indeed, at the point that dircolors is executed, it will already have been determined that colour is supported with some confidence. Defining COLORTERM=1 coerces dircolors into presuming so in turn, increasing the range of terminal emulators for which Gentoo is able to define LS_COLORS. Define _cwd as a local variable in genfun_set_win_title(). Consequently, _cwd is no longer a reserved variable name from the perspective of the user. Whitelist the Contour terminal emulator for XTWINOPS support. Signed-off-by: Kerin Millar <kfm@plushkava.net> Signed-off-by: Sam James <sam@gentoo.org>
286 lines
8.6 KiB
Diff
286 lines
8.6 KiB
Diff
From 2482e34ac12910f7e7436a968caf97ab96f8b582 Mon Sep 17 00:00:00 2001
|
|
From: Kerin Millar <kfm@plushkava.net>
|
|
Date: Sat, 26 Apr 2025 08:43:58 +0100
|
|
Subject: [PATCH 1/3] Backport fix for invalid continuation bytes being ignored
|
|
as delimiters
|
|
|
|
This is a partial backport of commit 772e7e760e8a098e4d8dee21cf11090be4757918
|
|
from the devel branch. It addresses an issue in read_mbchar() whereby
|
|
the read builtin can read past the delimiter character, provided that is
|
|
invoked with a multibyte character set in effect. Consider the following
|
|
test case.
|
|
|
|
$ LC_ALL=en_US.UTF-8
|
|
$ for i in {194..245}; do printf -v o %o "$i"; printf "\\$o\\n"; done |
|
|
while read -r; do declare -p REPLY; done
|
|
declare -- REPLY=$'\302\n\303\n\304\n\305\n\306\n\307\n\310\n\311\n\312\
|
|
n\313\n\314\n\315\n\316\n\317\n\320\n\321\n\322\n\323\n\324\n\325\n\326\
|
|
n\327\n\330\n\331\n\332\n\333\n\334\n\335\n\336\n\337\n\340\n\341\n\342\
|
|
n\343\n\344\n\345\n\346\n\347\n\350\n\351\n\352\n\353\n\354\n\355\n\356\
|
|
n\357\n\360\n\361\n\362\n\363\n\364\n\365'
|
|
|
|
The producing loop emits a sequence of bytes in the range 0xC2 - 0xF5.
|
|
Since each is terminated by a <newline> character, one would expect for
|
|
exactly 52 iterations of the consuming loop, with REPLY being assigned a
|
|
single byte each time. Instead, the input is read in its entirety. Why
|
|
is that, one may ask.
|
|
|
|
Given a legal UTF-8 byte sequence, any bytes whose values are between
|
|
0xC2 - 0xF4 are combinative in nature; they can only be followed by
|
|
between one and three bytes that are outside of that range.
|
|
|
|
0xC2 - 0xDF : First byte of a 2-byte code unit sequence
|
|
0xE0 - 0xEF : First byte of a 3-byte code unit sequence
|
|
0xF0 - 0xF4 : First byte of a 4-byte code unit sequence
|
|
|
|
As such, bash begins by reading the 0xC2 byte, for which mbrtowc(3)
|
|
returns -2, indicating an incomplete multibyte sequence. Next, the 0x0A
|
|
byte is read, for which mbrtowc(3) returns -1, indicating an invalid
|
|
multibyte sequence. At this point, bash ought to recognise the most
|
|
recently read byte as a delimiter. Instead, it continues reading the
|
|
input stream up until the delimiter that follows 0xF5, which is neither
|
|
a combining character nor legal in UTF-8 in any capacity.
|
|
|
|
This patch addresses the issue by introducing the zungetc() function,
|
|
which is used by read_mbchar() to push back the delimiter character that
|
|
transforms the sequence from an incomplete one to an invalid one. Said
|
|
character is then detected by the next invocation of the zread()
|
|
function, allowing for the decision to be made to return.
|
|
|
|
With this, the output of the test case amounts to 52 lines, as expected.
|
|
|
|
declare -- REPLY=$'\302'
|
|
declare -- REPLY=$'\303'
|
|
...
|
|
declare -- REPLY=$'\364'
|
|
declare -- REPLY=$'\365'
|
|
|
|
The issue affects all bash releases from 5.0 to 5.3-alpha. As of the
|
|
time of writing, it has not been addressed by any of the official
|
|
patchlevels, nor has 5.3 been released.
|
|
|
|
Link: https://pubs.opengroup.org/onlinepubs/9799919799.2024edition/utilities/read.html#tag_20_100_06
|
|
Link: https://mywiki.wooledge.org/BashPitfalls#IFS.3D_read_-r_-d_.27.27_filename
|
|
Link: https://lists.gnu.org/archive/html/bug-bash/2025-04/msg00068.html
|
|
Signed-off-by: Kerin Millar <kfm@plushkava.net>
|
|
---
|
|
builtins/read.def | 25 ++++++++++++----
|
|
externs.h | 1 +
|
|
lib/sh/zread.c | 74 +++++++++++++++++++++++++++++++++++++++++++++++
|
|
3 files changed, 94 insertions(+), 6 deletions(-)
|
|
|
|
diff --git builtins/read.def builtins/read.def
|
|
index ddd91d32..53b4bd81 100644
|
|
--- builtins/read.def
|
|
+++ builtins/read.def
|
|
@@ -130,7 +130,7 @@ static void set_readline_timeout PARAMS((sh_timer *t, time_t, long));
|
|
#endif
|
|
static SHELL_VAR *bind_read_variable PARAMS((char *, char *, int));
|
|
#if defined (HANDLE_MULTIBYTE)
|
|
-static int read_mbchar PARAMS((int, char *, int, int, int));
|
|
+static int read_mbchar PARAMS((int, char *, int, int, int, int));
|
|
#endif
|
|
static void ttyrestore PARAMS((struct ttsave *));
|
|
|
|
@@ -806,7 +806,7 @@ add_char:
|
|
else
|
|
# endif
|
|
if (locale_utf8locale == 0 || ((c & 0x80) != 0))
|
|
- i += read_mbchar (fd, input_string, i, c, unbuffered_read);
|
|
+ i += read_mbchar (fd, input_string, i, c, delim, unbuffered_read);
|
|
}
|
|
#endif
|
|
|
|
@@ -1064,10 +1064,10 @@ bind_read_variable (name, value, flags)
|
|
|
|
#if defined (HANDLE_MULTIBYTE)
|
|
static int
|
|
-read_mbchar (fd, string, ind, ch, unbuffered)
|
|
+read_mbchar (fd, string, ind, ch, delim, unbuffered)
|
|
int fd;
|
|
char *string;
|
|
- int ind, ch, unbuffered;
|
|
+ int ind, ch, delim, unbuffered;
|
|
{
|
|
char mbchar[MB_LEN_MAX + 1];
|
|
int i, n, r;
|
|
@@ -1101,8 +1101,21 @@ read_mbchar (fd, string, ind, ch, unbuffered)
|
|
mbchar[i++] = c;
|
|
continue;
|
|
}
|
|
- else if (ret == (size_t)-1 || ret == (size_t)0 || ret > (size_t)0)
|
|
- break;
|
|
+ else if (ret == (size_t)-1)
|
|
+ {
|
|
+ /* If we read a delimiter character that makes this an invalid
|
|
+ multibyte character, we can't just add it to the input string
|
|
+ and treat it as a byte. We need to push it back so a subsequent
|
|
+ zread will pick it up. */
|
|
+ if (c == delim)
|
|
+ {
|
|
+ zungetc (c);
|
|
+ mbchar[--i] = '\0'; /* unget the delimiter */
|
|
+ }
|
|
+ break; /* invalid multibyte character */
|
|
+ }
|
|
+ else if (ret == (size_t)0 || ret > (size_t)0)
|
|
+ break; /* valid multibyte character */
|
|
}
|
|
|
|
mbchar_return:
|
|
diff --git externs.h externs.h
|
|
index 931dba9c..1b70a13b 100644
|
|
--- externs.h
|
|
+++ externs.h
|
|
@@ -536,6 +536,7 @@ extern ssize_t zreadintr PARAMS((int, char *, size_t));
|
|
extern ssize_t zreadc PARAMS((int, char *));
|
|
extern ssize_t zreadcintr PARAMS((int, char *));
|
|
extern ssize_t zreadn PARAMS((int, char *, size_t));
|
|
+extern int zungetc PARAMS((int));
|
|
extern void zreset PARAMS((void));
|
|
extern void zsyncfd PARAMS((int));
|
|
|
|
diff --git lib/sh/zread.c lib/sh/zread.c
|
|
index dafb7f60..7cfbb288 100644
|
|
--- lib/sh/zread.c
|
|
+++ lib/sh/zread.c
|
|
@@ -41,6 +41,10 @@ extern int errno;
|
|
# define ZBUFSIZ 4096
|
|
#endif
|
|
|
|
+#ifndef EOF
|
|
+# define EOF -1
|
|
+#endif
|
|
+
|
|
extern int executing_builtin;
|
|
|
|
extern void check_signals_and_traps (void);
|
|
@@ -48,6 +52,11 @@ extern void check_signals (void);
|
|
extern int signal_is_trapped (int);
|
|
extern int read_builtin_timeout (int);
|
|
|
|
+int zungetc (int);
|
|
+
|
|
+/* Provide one character of pushback whether we are using read or zread. */
|
|
+static int zpushedchar = -1;
|
|
+
|
|
/* Read LEN bytes from FD into BUF. Retry the read on EINTR. Any other
|
|
error causes the loop to break. */
|
|
ssize_t
|
|
@@ -59,6 +68,15 @@ zread (fd, buf, len)
|
|
ssize_t r;
|
|
|
|
check_signals (); /* check for signals before a blocking read */
|
|
+
|
|
+ /* If we pushed a char back, return it immediately */
|
|
+ if (zpushedchar != -1)
|
|
+ {
|
|
+ *buf = (unsigned char)zpushedchar;
|
|
+ zpushedchar = -1;
|
|
+ return 1;
|
|
+ }
|
|
+
|
|
/* should generalize into a mechanism where different parts of the shell can
|
|
`register' timeouts and have them checked here. */
|
|
while (((r = read_builtin_timeout (fd)) < 0 || (r = read (fd, buf, len)) < 0) &&
|
|
@@ -95,6 +113,14 @@ zreadretry (fd, buf, len)
|
|
ssize_t r;
|
|
int nintr;
|
|
|
|
+ /* If we pushed a char back, return it immediately */
|
|
+ if (zpushedchar != -1)
|
|
+ {
|
|
+ *buf = (unsigned char)zpushedchar;
|
|
+ zpushedchar = -1;
|
|
+ return 1;
|
|
+ }
|
|
+
|
|
for (nintr = 0; ; )
|
|
{
|
|
r = read (fd, buf, len);
|
|
@@ -118,6 +144,15 @@ zreadintr (fd, buf, len)
|
|
size_t len;
|
|
{
|
|
check_signals ();
|
|
+
|
|
+ /* If we pushed a char back, return it immediately */
|
|
+ if (zpushedchar != -1)
|
|
+ {
|
|
+ *buf = (unsigned char)zpushedchar;
|
|
+ zpushedchar = -1;
|
|
+ return 1;
|
|
+ }
|
|
+
|
|
return (read (fd, buf, len));
|
|
}
|
|
|
|
@@ -135,6 +170,14 @@ zreadc (fd, cp)
|
|
{
|
|
ssize_t nr;
|
|
|
|
+ /* If we pushed a char back, return it immediately */
|
|
+ if (zpushedchar != -1 && cp)
|
|
+ {
|
|
+ *cp = (unsigned char)zpushedchar;
|
|
+ zpushedchar = -1;
|
|
+ return 1;
|
|
+ }
|
|
+
|
|
if (lind == lused || lused == 0)
|
|
{
|
|
nr = zread (fd, lbuf, sizeof (lbuf));
|
|
@@ -160,6 +203,14 @@ zreadcintr (fd, cp)
|
|
{
|
|
ssize_t nr;
|
|
|
|
+ /* If we pushed a char back, return it immediately */
|
|
+ if (zpushedchar != -1 && cp)
|
|
+ {
|
|
+ *cp = (unsigned char)zpushedchar;
|
|
+ zpushedchar = -1;
|
|
+ return 1;
|
|
+ }
|
|
+
|
|
if (lind == lused || lused == 0)
|
|
{
|
|
nr = zreadintr (fd, lbuf, sizeof (lbuf));
|
|
@@ -186,6 +237,13 @@ zreadn (fd, cp, len)
|
|
{
|
|
ssize_t nr;
|
|
|
|
+ if (zpushedchar != -1 && cp)
|
|
+ {
|
|
+ *cp = zpushedchar;
|
|
+ zpushedchar = -1;
|
|
+ return 1;
|
|
+ }
|
|
+
|
|
if (lind == lused || lused == 0)
|
|
{
|
|
if (len > sizeof (lbuf))
|
|
@@ -204,6 +262,22 @@ zreadn (fd, cp, len)
|
|
return 1;
|
|
}
|
|
|
|
+int
|
|
+zungetc (c)
|
|
+ int c;
|
|
+{
|
|
+ if (zpushedchar == -1)
|
|
+ {
|
|
+ zpushedchar = c;
|
|
+ return c;
|
|
+ }
|
|
+
|
|
+ if (c == EOF || lind == 0)
|
|
+ return (EOF);
|
|
+ lbuf[--lind] = c; /* XXX */
|
|
+ return c;
|
|
+}
|
|
+
|
|
void
|
|
zreset ()
|
|
{
|
|
--
|
|
2.49.0
|
|
|