import java.util.ArrayList;
import java.util.List;

/**
 * Splits a string into tokens using a configurable set of single-character
 * delimiters.
 *
 * <p>Usage:
 * <pre>
 *   tokenize("hello world")           -> [ 'hello', 'world' ]   (default delimiter is ' ')
 *   tokenize("hello world", ' ', 'l') -> [ 'he', 'o', 'wor', 'd' ]
 * </pre>
 *
 * <p>To run this file:
 * <pre>
 *   javac tokenizer.java
 *   java tokenizer
 * </pre>
 */
public class tokenizer {

    /**
     * Checks whether a character is one of the given delimiters.
     *
     * @param c          the character to test
     * @param delimiters the delimiter characters to compare against
     * @return {@code true} if {@code c} equals any element of {@code delimiters}
     */
    public static boolean charIsDelimiter(char c, char[] delimiters) {
        for (char d : delimiters) {
            if (c == d) {
                return true;
            }
        }
        return false;
    }

    /**
     * Splits {@code str} at every delimiter character and returns the non-empty
     * pieces in order of appearance.
     *
     * <p>Empty tokens are never produced: consecutive delimiters, a leading
     * delimiter, a trailing delimiter and an empty input all contribute nothing
     * to the result.
     *
     * @param str        the text to split
     * @param delimiters the characters to split on; when none are given (or the
     *                   array is {@code null}) a single space {@code ' '} is used
     * @return a new mutable list holding the tokens; empty if there are none
     * @throws NullPointerException if {@code str} is {@code null}
     */
    public static List<String> tokenize(String str, char... delimiters) {
        // By default the delimiter is a single white space.
        if (delimiters == null || delimiters.length == 0) {
            delimiters = new char[] {' '};
        }

        List<String> tokens = new ArrayList<String>();
        // StringBuilder avoids re-copying the token on every appended character.
        StringBuilder token = new StringBuilder();

        for (int i = 0; i < str.length(); i++) {
            char current = str.charAt(i);

            if (charIsDelimiter(current, delimiters)) {
                flushToken(token, tokens);
            } else {
                token.append(current);
            }
        }

        // The last token has no delimiter after it, so flush it explicitly.
        // Going through flushToken keeps the "no empty tokens" rule for inputs
        // that are empty or end with a delimiter.
        flushToken(token, tokens);

        return tokens;
    }

    /** Appends the pending token to {@code tokens} if it is non-empty, then clears it. */
    private static void flushToken(StringBuilder token, List<String> tokens) {
        if (token.length() > 0) {
            tokens.add(token.toString());
            token.setLength(0);
        }
    }

    /**
     * Prints the tokens to standard output on one line, in the form
     * {@code [ 'a', 'b', ]}. Does nothing when {@code tokens} is {@code null}.
     *
     * @param tokens the tokens to print, may be {@code null}
     */
    public static void printTokens(List<String> tokens) {
        if (tokens == null) {
            return;
        }

        System.out.print("[ ");
        for (String token : tokens) {
            System.out.print("'" + token + "', ");
        }
        System.out.println("]");
    }

    /** Demonstrates {@link #tokenize(String, char...)} with several delimiter sets. */
    public static void main(String[] args) {
        String myString = "Hello I like pasta & pizza--hut";

        System.out.println("myString = '" + myString + "'");

        System.out.print("\ntokenize(myString) = ");
        printTokens(tokenize(myString));

        System.out.print("\ntokenize(myString, ' ', 'z') = ");
        printTokens(tokenize(myString, ' ', 'z'));

        System.out.print("\ntokenize(myString, 'p','l', 'u') = ");
        printTokens(tokenize(myString, 'p', 'l', 'u'));

        System.out.print("\ntokenize(myString, ' ', '&', '-') = ");
        printTokens(tokenize(myString, ' ', '&', '-'));
    }
}

/*
    Expected output of main:

    myString = 'Hello I like pasta & pizza--hut'

    tokenize(myString) = [ 'Hello', 'I', 'like', 'pasta', '&', 'pizza--hut', ]

    tokenize(myString, ' ', 'z') = [ 'Hello', 'I', 'like', 'pasta', '&', 'pi', 'a--hut', ]

    tokenize(myString, 'p','l', 'u') = [ 'He', 'o I ', 'ike ', 'asta & ', 'izza--h', 't', ]

    tokenize(myString, ' ', '&', '-') = [ 'Hello', 'I', 'like', 'pasta', 'pizza', 'hut', ]
*/