Redesign query sanitizer (#618)

2023-06-10 07:56:18 -06:00 · 2023-06-10 07:56:18 -06:00 · 541cf79b6f
commit 541cf79b6f
parent 55fa82f92e
5 changed files with 181 additions and 288 deletions
--- a/Source/LibationSearchEngine/QuerySanitizer.cs
+++ b/Source/LibationSearchEngine/QuerySanitizer.cs
@ -0,0 +1,153 @@
+using Lucene.Net.Analysis.Standard;
+using Lucene.Net.Analysis.Tokenattributes;
+using System.Collections.Generic;
+using System.Linq;
+
+namespace LibationSearchEngine
+{
+	internal static class QuerySanitizer
+	{
+		private static readonly HashSet<string> idTerms
+				= SearchEngine.idIndexRules.Keys
+				.Select(s => s.ToLowerInvariant())
+				.ToHashSet();
+
+		private static readonly HashSet<string> boolTerms
+				= SearchEngine.boolIndexRules.Keys
+				.Select(s => s.ToLowerInvariant())
+				.ToHashSet();
+
+		private static readonly HashSet<string> fieldTerms
+				= SearchEngine.stringIndexRules.Keys
+				.Union(SearchEngine.numberIndexRules.Keys)
+				.Select(s => s.ToLowerInvariant())
+				.Union(idTerms)
+				.Union(boolTerms)
+				.ToHashSet();
+
+		internal static string Sanitize(string searchString, StandardAnalyzer analyzer)
+		{
+			if (string.IsNullOrWhiteSpace(searchString))
+				return SearchEngine.ALL_QUERY;
+
+			// range operator " TO " and bool operators " AND " and " OR " must be uppercase
+			searchString
+				= searchString
+				.Replace(" to ", " TO ", System.StringComparison.OrdinalIgnoreCase)
+				.Replace(" and ", " AND ", System.StringComparison.OrdinalIgnoreCase)
+				.Replace(" or ", " OR ", System.StringComparison.OrdinalIgnoreCase);
+
+			using var tokenStream = analyzer.TokenStream(SearchEngine.ALL, new System.IO.StringReader(searchString));
+
+			var partList = new List<string>();
+			int previousEndOffset = 0;
+			bool previousIsBool = false, previousIsTags = false, previousIsAsin = false;
+
+			while (tokenStream.IncrementToken())
+			{
+				var term = tokenStream.GetAttribute<ITermAttribute>().Term;
+				var offset = tokenStream.GetAttribute<IOffsetAttribute>();
+
+				if (previousIsBool && !bool.TryParse(term, out _))
+				{
+					//The previous term was a boolean tag and this term is NOT a bool value
+					//Add the default ":True" bool and continue parsing the current term
+					partList.Add(":True");
+					previousIsBool = false;
+				}
+
+				//Add all text between the current token and the previous token
+				partList.Add(searchString.Substring(previousEndOffset, offset.StartOffset - previousEndOffset));
+
+				if (previousIsBool)
+				{
+					//The previous term was a boolean tag and this term is a bool value
+					addUnalteredToken(offset);
+					previousIsBool = false;
+				}
+				else if (previousIsAsin)
+				{
+					//The previous term was an ASIN field ID, so this term is an ASIN
+					partList.Add(term);
+					previousIsAsin = false;
+				}
+				else if (previousIsTags)
+				{
+					//This term is a tag. Do this check before checking if term is a defined field
+					//so that "tags:israted" does not parse as a bool
+					addUnalteredToken(offset);
+					previousIsTags = false;
+				}
+				else if (tryParseBlockTag(offset, partList, searchString, out var tagName))
+				{
+					//The term is a block tag. add it to the part list
+					partList.Add($"{SearchEngine.TAGS}:{tagName}");
+				}
+				else if (double.TryParse(term, out var num))
+				{
+					//Term is a number so pad it with zeros
+					partList.Add(num.ToLuceneString());
+				}
+				else if (fieldTerms.Contains(term))
+				{
+					//Term is a defined search field, add it.
+					//The StandardAnalyzer already converts all terms to lowercase
+					partList.Add(term);
+					previousIsBool = boolTerms.Contains(term);
+					previousIsAsin = idTerms.Contains(term);
+					previousIsTags = term == SearchEngine.TAGS;
+				}
+				else
+				{
+					//Term is any other user-defined constant value
+					addUnalteredToken(offset);
+				}
+
+				previousEndOffset = offset.EndOffset;
+			}
+
+			if (previousIsBool)
+				partList.Add(":True");
+
+			//Add ending non-token text
+			partList.Add(searchString.Substring(previousEndOffset, searchString.Length - previousEndOffset));
+
+			return string.Concat(partList);
+
+			//Add the full, unaltered token as well as all inter-token text
+			void addUnalteredToken(IOffsetAttribute offset) =>
+				partList.Add(searchString.Substring(offset.StartOffset, offset.EndOffset - offset.StartOffset));			
+		}
+
+		private static bool tryParseBlockTag(IOffsetAttribute offset, List<string> partList, string searchString, out string tagName)
+		{
+			tagName = null;
+			if (partList.Count == 0) return false;
+
+			var previous = partList[^1].TrimEnd();
+
+			//cannot be preceeded by an escaping \
+			if (previous.Length == 0) return false;
+			if (previous[^1] != '[' || (previous.Length > 1 && previous[^2] == '\\')) return false;
+
+			var next = searchString.Substring(offset.EndOffset);
+			if (next.Length == 0 || !next.TrimStart().StartsWith(']')) return false;
+
+			tagName = searchString.Substring(offset.StartOffset, offset.EndOffset - offset.StartOffset);
+
+			//Only legal tag characters are letters, numbers and underscores
+			//Per DataLayer.UserDefinedItem.IllegalCharacterRegex()
+			foreach (var c in tagName)
+			{
+				if (!char.IsLetterOrDigit(c) && c != '_')
+					return false;
+			}
+
+			//Remove the leading '['
+			partList[^1] = previous[..^1];
+			//Ignore the trailing ']'
+			offset.SetOffset(offset.StartOffset, searchString.IndexOf(']', offset.EndOffset) + 1);
+			return true;
+		}
+	}
+}