mirror of
https://github.com/vim/vim
synced 2025-03-16 14:57:52 +01:00
Problem: vim_strnchr() is strange and unnecessary (after v9.1.1009) Solution: Remove vim_strnchr() and use memchr() instead. Also remove a comment referencing an #if that is no longer present. vim_strnchr() is strange in several ways: - It's named like vim_strchr(), but unlike vim_strchr() it doesn't support finding a multibyte char. - Its logic is similar to vim_strbyte(), but unlike vim_strbyte() it uses char instead of char_u. - It takes a pointer as its size argument, which isn't convenient for all its callers. - It allows embedded NULs, unlike other "strn*" functions which stop when encountering a NUL byte. In comparison, memchr() also allows embedded NULs, and it converts bytes in the string to (unsigned char). closes: #16579 Signed-off-by: zeertzjq <zeertzjq@outlook.com> Signed-off-by: Christian Brabandt <cb@256bit.org>
483 lines
14 KiB
C
483 lines
14 KiB
C
/* vi:set ts=8 sts=4 sw=4 noet:
|
|
*
|
|
* VIM - Vi IMproved by Bram Moolenaar
|
|
*
|
|
* Do ":help uganda" in Vim to read copying and usage conditions.
|
|
* Do ":help credits" in Vim to see a list of people who contributed.
|
|
* See README.txt for an overview of the Vim source code.
|
|
*/
|
|
|
|
#include "vim.h"
|
|
|
|
#define LN_MAX_BUFS 8
|
|
#define LN_DECISION_MAX 255 // pow(2, LN_MAX_BUFS(8)) - 1 = 255
|
|
|
|
// struct for running the diff linematch algorithm
|
|
typedef struct diffcmppath_S diffcmppath_T;
|
|
struct diffcmppath_S
|
|
{
|
|
// to keep track of the total score of this path
|
|
int df_lev_score;
|
|
size_t df_path_n; // current index of this path
|
|
int df_choice_mem[LN_DECISION_MAX + 1];
|
|
int df_choice[LN_DECISION_MAX];
|
|
// to keep track of this path traveled
|
|
diffcmppath_T *df_decision[LN_DECISION_MAX];
|
|
size_t df_optimal_choice;
|
|
};
|
|
|
|
static int matching_chars(const mmfile_t *m1, const mmfile_t *m2);
|
|
static size_t unwrap_indexes(const int *values, const int *diff_len, const size_t ndiffs);
|
|
static size_t test_charmatch_paths(diffcmppath_T *node, int lastdecision);
|
|
|
|
static size_t
|
|
line_len(const mmfile_t *m)
|
|
{
|
|
char *s = m->ptr;
|
|
char *end;
|
|
|
|
end = memchr(s, '\n', (size_t)m->size);
|
|
return end ? (size_t)(end - s) : (size_t)m->size;
|
|
}
|
|
|
|
#define MATCH_CHAR_MAX_LEN 800
|
|
|
|
/// Same as matching_chars but ignore whitespace
|
|
///
|
|
/// @param s1
|
|
/// @param s2
|
|
static int
|
|
matching_chars_iwhite(const mmfile_t *s1, const mmfile_t *s2)
|
|
{
|
|
// the newly processed strings that will be compared
|
|
// delete the white space characters
|
|
mmfile_t sp[2];
|
|
char p[2][MATCH_CHAR_MAX_LEN];
|
|
|
|
for (int k = 0; k < 2; k++)
|
|
{
|
|
const mmfile_t *s = k == 0 ? s1 : s2;
|
|
size_t pi = 0;
|
|
size_t slen = MIN(MATCH_CHAR_MAX_LEN - 1, line_len(s));
|
|
|
|
for (size_t i = 0; i <= slen; i++)
|
|
{
|
|
char e = s->ptr[i];
|
|
|
|
if (e != ' ' && e != '\t')
|
|
{
|
|
p[k][pi] = e;
|
|
pi++;
|
|
}
|
|
}
|
|
|
|
sp[k].ptr = p[k];
|
|
sp[k].size = (int)pi;
|
|
}
|
|
|
|
return matching_chars(&sp[0], &sp[1]);
|
|
}
|
|
|
|
/// Return matching characters between "s1" and "s2" whilst respecting sequence
|
|
/// order.
|
|
/// Consider the case of two strings 'AAACCC' and 'CCCAAA', the
|
|
/// return value from this function will be 3, either to match
|
|
/// the 3 C's, or the 3 A's.
|
|
///
|
|
/// Examples:
|
|
/// matching_chars("aabc", "acba") -> 2 // 'a' and 'b' in common
|
|
/// matching_chars("123hello567", "he123ll567o") -> 8 // '123', 'll' and '567' in common
|
|
/// matching_chars("abcdefg", "gfedcba") -> 1 // all characters in common,
|
|
/// // but only at most 1 in sequence
|
|
///
|
|
/// @param m1
|
|
/// @param m2
|
|
static int
|
|
matching_chars(const mmfile_t *m1, const mmfile_t *m2)
|
|
{
|
|
size_t s1len = MIN(MATCH_CHAR_MAX_LEN - 1, line_len(m1));
|
|
size_t s2len = MIN(MATCH_CHAR_MAX_LEN - 1, line_len(m2));
|
|
char *s1 = m1->ptr;
|
|
char *s2 = m2->ptr;
|
|
int matrix[2][MATCH_CHAR_MAX_LEN] = { 0 };
|
|
int icur = 1; // save space by storing only two rows for i axis
|
|
|
|
for (size_t i = 0; i < s1len; i++)
|
|
{
|
|
icur = (icur == 1 ? 0 : 1);
|
|
int *e1 = matrix[icur];
|
|
int *e2 = matrix[!icur];
|
|
|
|
for (size_t j = 0; j < s2len; j++)
|
|
{
|
|
// skip char in s1
|
|
if (e2[j + 1] > e1[j + 1])
|
|
e1[j + 1] = e2[j + 1];
|
|
// skip char in s2
|
|
if (e1[j] > e1[j + 1])
|
|
e1[j + 1] = e1[j];
|
|
// compare char in s1 and s2
|
|
if ((s1[i] == s2[j]) && (e2[j] + 1) > e1[j + 1])
|
|
e1[j + 1] = e2[j] + 1;
|
|
}
|
|
}
|
|
|
|
return matrix[icur][s2len];
|
|
}
|
|
|
|
/// count the matching characters between a variable number of strings "sp"
|
|
/// mark the strings that have already been compared to extract them later
|
|
/// without re-running the character match counting.
|
|
/// @param sp
|
|
/// @param fomvals
|
|
/// @param n
|
|
static int
|
|
count_n_matched_chars(mmfile_t **sp, const size_t n, int iwhite)
|
|
{
|
|
int matched_chars = 0;
|
|
int matched = 0;
|
|
|
|
for (size_t i = 0; i < n; i++)
|
|
{
|
|
for (size_t j = i + 1; j < n; j++)
|
|
{
|
|
if (sp[i]->ptr != NULL && sp[j]->ptr != NULL)
|
|
{
|
|
matched++;
|
|
// TODO(lewis6991): handle whitespace ignoring higher up in the
|
|
// stack
|
|
matched_chars += iwhite ? matching_chars_iwhite(sp[i], sp[j])
|
|
: matching_chars(sp[i], sp[j]);
|
|
}
|
|
}
|
|
}
|
|
|
|
// prioritize a match of 3 (or more lines) equally to a match of 2 lines
|
|
if (matched >= 2)
|
|
{
|
|
matched_chars *= 2;
|
|
matched_chars /= matched;
|
|
}
|
|
|
|
return matched_chars;
|
|
}
|
|
|
|
static mmfile_t
|
|
fastforward_buf_to_lnum(mmfile_t s, linenr_T lnum)
|
|
{
|
|
for (int i = 0; i < lnum - 1; i++)
|
|
{
|
|
char *line_end;
|
|
|
|
line_end = memchr(s.ptr, '\n', (size_t)s.size);
|
|
s.size = line_end ? (int)(s.size - (line_end - s.ptr)) : 0;
|
|
s.ptr = line_end;
|
|
if (!s.ptr)
|
|
break;
|
|
s.ptr++;
|
|
s.size--;
|
|
}
|
|
|
|
return s;
|
|
}
|
|
|
|
/// try all the different ways to compare these lines and use the one that
|
|
/// results in the most matching characters
|
|
/// @param df_iters
|
|
/// @param paths
|
|
/// @param npaths
|
|
/// @param path_idx
|
|
/// @param choice
|
|
/// @param diffcmppath
|
|
/// @param diff_len
|
|
/// @param ndiffs
|
|
/// @param diff_blk
|
|
static void
|
|
try_possible_paths(
|
|
const int *df_iters,
|
|
const size_t *paths,
|
|
const int npaths,
|
|
const int path_idx,
|
|
int *choice,
|
|
diffcmppath_T *diffcmppath,
|
|
const int *diff_len,
|
|
const size_t ndiffs,
|
|
const mmfile_t **diff_blk,
|
|
int iwhite)
|
|
{
|
|
if (path_idx == npaths)
|
|
{
|
|
if ((*choice) > 0)
|
|
{
|
|
int from_vals[LN_MAX_BUFS] = { 0 };
|
|
const int *to_vals = df_iters;
|
|
|
|
mmfile_t mm[LN_MAX_BUFS]; // stack memory for current_lines
|
|
mmfile_t *current_lines[LN_MAX_BUFS];
|
|
for (size_t k = 0; k < ndiffs; k++)
|
|
{
|
|
from_vals[k] = df_iters[k];
|
|
// get the index at all of the places
|
|
if ((*choice) & (1 << k))
|
|
{
|
|
from_vals[k]--;
|
|
mm[k] = fastforward_buf_to_lnum(*diff_blk[k], df_iters[k]);
|
|
}
|
|
else
|
|
CLEAR_FIELD(mm[k]);
|
|
current_lines[k] = &mm[k];
|
|
}
|
|
size_t unwrapped_idx_from = unwrap_indexes(from_vals, diff_len, ndiffs);
|
|
size_t unwrapped_idx_to = unwrap_indexes(to_vals, diff_len, ndiffs);
|
|
int matched_chars = count_n_matched_chars(current_lines, ndiffs, iwhite);
|
|
int score = diffcmppath[unwrapped_idx_from].df_lev_score + matched_chars;
|
|
|
|
if (score > diffcmppath[unwrapped_idx_to].df_lev_score)
|
|
{
|
|
diffcmppath[unwrapped_idx_to].df_path_n = 1;
|
|
diffcmppath[unwrapped_idx_to].df_decision[0] =
|
|
&diffcmppath[unwrapped_idx_from];
|
|
diffcmppath[unwrapped_idx_to].df_choice[0] = *choice;
|
|
diffcmppath[unwrapped_idx_to].df_lev_score = score;
|
|
}
|
|
else if (score == diffcmppath[unwrapped_idx_to].df_lev_score)
|
|
{
|
|
size_t k = diffcmppath[unwrapped_idx_to].df_path_n++;
|
|
diffcmppath[unwrapped_idx_to].df_decision[k] =
|
|
&diffcmppath[unwrapped_idx_from];
|
|
diffcmppath[unwrapped_idx_to].df_choice[k] = *choice;
|
|
}
|
|
}
|
|
return;
|
|
}
|
|
|
|
size_t bit_place = paths[path_idx];
|
|
*(choice) |= (1 << bit_place); // set it to 1
|
|
try_possible_paths(df_iters, paths, npaths, path_idx + 1, choice,
|
|
diffcmppath, diff_len, ndiffs, diff_blk, iwhite);
|
|
*(choice) &= ~(1 << bit_place); // set it to 0
|
|
try_possible_paths(df_iters, paths, npaths, path_idx + 1, choice,
|
|
diffcmppath, diff_len, ndiffs, diff_blk, iwhite);
|
|
}
|
|
|
|
/// unwrap indexes to access n dimensional tensor
|
|
/// @param values
|
|
/// @param diff_len
|
|
/// @param ndiffs
|
|
static size_t
|
|
unwrap_indexes(const int *values, const int *diff_len, const size_t ndiffs)
|
|
{
|
|
size_t num_unwrap_scalar = 1;
|
|
|
|
for (size_t k = 0; k < ndiffs; k++)
|
|
num_unwrap_scalar *= (size_t)diff_len[k] + 1;
|
|
|
|
size_t path_idx = 0;
|
|
for (size_t k = 0; k < ndiffs; k++)
|
|
{
|
|
num_unwrap_scalar /= (size_t)diff_len[k] + 1;
|
|
|
|
int n = values[k];
|
|
path_idx += num_unwrap_scalar * (size_t)n;
|
|
}
|
|
|
|
return path_idx;
|
|
}
|
|
|
|
/// populate the values of the linematch algorithm tensor, and find the best
|
|
/// decision for how to compare the relevant lines from each of the buffers at
|
|
/// each point in the tensor
|
|
/// @param df_iters
|
|
/// @param ch_dim
|
|
/// @param diffcmppath
|
|
/// @param diff_len
|
|
/// @param ndiffs
|
|
/// @param diff_blk
|
|
static void
|
|
populate_tensor(
|
|
int *df_iters,
|
|
const size_t ch_dim,
|
|
diffcmppath_T *diffcmppath,
|
|
const int *diff_len,
|
|
const size_t ndiffs,
|
|
const mmfile_t **diff_blk,
|
|
int iwhite)
|
|
{
|
|
if (ch_dim == ndiffs)
|
|
{
|
|
int npaths = 0;
|
|
size_t paths[LN_MAX_BUFS];
|
|
|
|
for (size_t j = 0; j < ndiffs; j++)
|
|
{
|
|
if (df_iters[j] > 0)
|
|
{
|
|
paths[npaths] = j;
|
|
npaths++;
|
|
}
|
|
}
|
|
|
|
int choice = 0;
|
|
size_t unwrapper_idx_to = unwrap_indexes(df_iters, diff_len, ndiffs);
|
|
|
|
diffcmppath[unwrapper_idx_to].df_lev_score = -1;
|
|
try_possible_paths(df_iters, paths, npaths, 0, &choice, diffcmppath,
|
|
diff_len, ndiffs, diff_blk, iwhite);
|
|
return;
|
|
}
|
|
|
|
for (int i = 0; i <= diff_len[ch_dim]; i++)
|
|
{
|
|
df_iters[ch_dim] = i;
|
|
populate_tensor(df_iters, ch_dim + 1, diffcmppath, diff_len,
|
|
ndiffs, diff_blk, iwhite);
|
|
}
|
|
}
|
|
|
|
/// algorithm to find an optimal alignment of lines of a diff block with 2 or
|
|
/// more files. The algorithm is generalized to work for any number of files
|
|
/// which corresponds to another dimension added to the tensor used in the
|
|
/// algorithm
|
|
///
|
|
/// for questions and information about the linematch algorithm please contact
|
|
/// Jonathon White (jonathonwhite@protonmail.com)
|
|
///
|
|
/// for explanation, a summary of the algorithm in 3 dimensions (3 files
|
|
/// compared) follows
|
|
///
|
|
/// The 3d case (for 3 buffers) of the algorithm implemented when diffopt
|
|
/// 'linematch' is enabled. The algorithm constructs a 3d tensor to
|
|
/// compare a diff between 3 buffers. The dimensions of the tensor are
|
|
/// the length of the diff in each buffer plus 1 A path is constructed by
|
|
/// moving from one edge of the cube/3d tensor to the opposite edge.
|
|
/// Motions from one cell of the cube to the next represent decisions. In
|
|
/// a 3d cube, there are a total of 7 decisions that can be made,
|
|
/// represented by the enum df_path3_choice which is defined in
|
|
/// buffer_defs.h a comparison of buffer 0 and 1 represents a motion
|
|
/// toward the opposite edge of the cube with components along the 0 and
|
|
/// 1 axes. a comparison of buffer 0, 1, and 2 represents a motion
|
|
/// toward the opposite edge of the cube with components along the 0, 1,
|
|
/// and 2 axes. A skip of buffer 0 represents a motion along only the 0
|
|
/// axis. For each action, a point value is awarded, and the path is
|
|
/// saved for reference later, if it is found to have been the optimal
|
|
/// path. The optimal path has the highest score. The score is
|
|
/// calculated as the summation of the total characters matching between
|
|
/// all of the lines which were compared. The structure of the algorithm
|
|
/// is that of a dynamic programming problem. We can calculate a point
|
|
/// i,j,k in the cube as a function of i-1, j-1, and k-1. To find the
|
|
/// score and path at point i,j,k, we must determine which path we want
|
|
/// to use, this is done by looking at the possibilities and choosing
|
|
/// the one which results in the local highest score. The total highest
|
|
/// scored path is, then in the end represented by the cell in the
|
|
/// opposite corner from the start location. The entire algorithm
|
|
/// consists of populating the 3d cube with the optimal paths from which
|
|
/// it may have came.
|
|
///
|
|
/// Optimizations:
|
|
/// As the function to calculate the cell of a tensor at point i,j,k is a
|
|
/// function of the cells at i-1, j-1, k-1, the whole tensor doesn't need
|
|
/// to be stored in memory at once. In the case of the 3d cube, only two
|
|
/// slices (along k and j axis) are stored in memory. For the 2d matrix
|
|
/// (for 2 files), only two rows are stored at a time. The next/previous
|
|
/// slice (or row) is always calculated from the other, and they alternate
|
|
/// at each iteration.
|
|
/// In the 3d case, 3 arrays are populated to memorize the score (matched
|
|
/// characters) of the 3 buffers, so a redundant calculation of the
|
|
/// scores does not occur
|
|
/// @param diff_blk
|
|
/// @param diff_len
|
|
/// @param ndiffs
|
|
/// @param [out] [allocated] decisions
|
|
/// @return the length of decisions
|
|
size_t
|
|
linematch_nbuffers(
|
|
const mmfile_t **diff_blk,
|
|
const int *diff_len,
|
|
const size_t ndiffs,
|
|
int **decisions,
|
|
int iwhite)
|
|
{
|
|
assert(ndiffs <= LN_MAX_BUFS);
|
|
|
|
size_t memsize = 1;
|
|
size_t memsize_decisions = 0;
|
|
for (size_t i = 0; i < ndiffs; i++)
|
|
{
|
|
assert(diff_len[i] >= 0);
|
|
memsize *= (size_t)(diff_len[i] + 1);
|
|
memsize_decisions += (size_t)diff_len[i];
|
|
}
|
|
|
|
// create the flattened path matrix
|
|
diffcmppath_T *diffcmppath = lalloc(sizeof(diffcmppath_T) * memsize, TRUE);
|
|
// allocate memory here
|
|
for (size_t i = 0; i < memsize; i++)
|
|
{
|
|
diffcmppath[i].df_lev_score = 0;
|
|
diffcmppath[i].df_path_n = 0;
|
|
for (size_t j = 0; j < (size_t)pow(2, (double)ndiffs); j++)
|
|
diffcmppath[i].df_choice_mem[j] = -1;
|
|
}
|
|
|
|
// memory for avoiding repetitive calculations of score
|
|
int df_iters[LN_MAX_BUFS];
|
|
populate_tensor(df_iters, 0, diffcmppath, diff_len, ndiffs, diff_blk,
|
|
iwhite);
|
|
|
|
const size_t u = unwrap_indexes(diff_len, diff_len, ndiffs);
|
|
diffcmppath_T *startNode = &diffcmppath[u];
|
|
|
|
*decisions = lalloc(sizeof(int) * memsize_decisions, TRUE);
|
|
size_t n_optimal = 0;
|
|
test_charmatch_paths(startNode, 0);
|
|
while (startNode->df_path_n > 0)
|
|
{
|
|
size_t j = startNode->df_optimal_choice;
|
|
(*decisions)[n_optimal++] = startNode->df_choice[j];
|
|
startNode = startNode->df_decision[j];
|
|
}
|
|
// reverse array
|
|
for (size_t i = 0; i < (n_optimal / 2); i++)
|
|
{
|
|
int tmp = (*decisions)[i];
|
|
(*decisions)[i] = (*decisions)[n_optimal - 1 - i];
|
|
(*decisions)[n_optimal - 1 - i] = tmp;
|
|
}
|
|
|
|
vim_free(diffcmppath);
|
|
|
|
return n_optimal;
|
|
}
|
|
|
|
// returns the minimum amount of path changes from start to end
|
|
static size_t
|
|
test_charmatch_paths(diffcmppath_T *node, int lastdecision)
|
|
{
|
|
// memoization
|
|
if (node->df_choice_mem[lastdecision] == -1)
|
|
{
|
|
if (node->df_path_n == 0)
|
|
// we have reached the end of the tree
|
|
node->df_choice_mem[lastdecision] = 0;
|
|
else
|
|
{
|
|
// the minimum amount of turns required to reach the end
|
|
size_t minimum_turns = SIZE_MAX;
|
|
for (size_t i = 0; i < node->df_path_n; i++)
|
|
{
|
|
// recurse
|
|
size_t t = test_charmatch_paths(node->df_decision[i],
|
|
node->df_choice[i]) +
|
|
(lastdecision != node->df_choice[i] ? 1 : 0);
|
|
if (t < minimum_turns)
|
|
{
|
|
node->df_optimal_choice = i;
|
|
minimum_turns = t;
|
|
}
|
|
}
|
|
node->df_choice_mem[lastdecision] = (int)minimum_turns;
|
|
}
|
|
}
|
|
|
|
return (size_t)node->df_choice_mem[lastdecision];
|
|
}
|