-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathstring-processing.html
1090 lines (804 loc) · 46 KB
/
string-processing.html
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
<!DOCTYPE html>
<!-- The tutorial text is written in English; declare it so assistive technology picks the right voice. -->
<html lang="en">
<head>
<!-- Character encoding is declared first, before any text content in the head. -->
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1">
<title>String Processing</title>
<script type="text/javascript">
// Once the page has loaded, center every image that is the sole element child of its parent.
window.onload = function() {
  var pictures = document.getElementsByTagName('img');
  var index = 0;
  var holder;
  while (index < pictures.length) {
    holder = pictures[index].parentElement;
    index += 1;
    // a parent holding other elements besides the image keeps its current alignment
    if (holder.childElementCount !== 1) {
      continue;
    }
    holder.style.textAlign = 'center';
  }
};
</script>
<!-- Styles for R syntax highlighter -->
<style type="text/css">
/* Colours for the token classes that the syntax highlighter assigns to spans inside <pre> blocks. */
/* Operators and brackets share one muted grey-blue. */
pre .operator,
pre .paren {
color: rgb(104, 118, 135)
}
pre .literal {
color: #990073
}
pre .number {
color: #099;
}
/* Comments are de-emphasised with a grey italic face. */
pre .comment {
color: #998;
font-style: italic
}
/* Keywords stand out in bold dark red. */
pre .keyword {
color: #900;
font-weight: bold
}
/* Identifiers are plain black. */
pre .identifier {
color: rgb(0, 0, 0);
}
pre .string {
color: #d14;
}
</style>
<!-- R syntax highlighter -->
<script type="text/javascript">
// Bundled, minified syntax highlighter exposed as the global `hljs`, followed by the
// `cpp` and `r` language definitions registered on hljs.LANGUAGES.
//
// Public members set on `hljs`: LANGUAGES, highlight(language, text), highlightAuto(text),
// fixMarkup, highlightBlock(node, tabReplace, useBR), initHighlighting, initHighlightingOnLoad,
// plus shared regex fragments (IR, UIR, NR, CNR, BNR, RSR, ER) and reusable modes
// (BE, ASM, QSM, CLCM, CBLCLM, HCM, NM, CNM, BNM) and inherit().
//
// highlight() returns {r, keyword_count, value}; `value` is an HTML string that
// highlightBlock() assigns to innerHTML, so every piece of source text must pass through
// the escape helper m() first.
//
// Fixes relative to the previous copy of this block:
//  * m() now really escapes: "&" -> "&amp;" and "<" -> "&lt;". The replacement strings had
//    been entity-decoded into "&" and "<", turning m() into a no-op and letting raw "<"
//    from code samples reach innerHTML.
//  * the R `...` keyword pattern is again the single literal "\\.\\.\\."; it had been split
//    across two physical lines inside the string, where backslash-newline is a line
//    continuation, so the regex had become `.\.\.` (any character followed by two dots).
var hljs=new function(){function m(p){return p.replace(/&/gm,"&amp;").replace(/</gm,"&lt;")}function f(r,q,p){return RegExp(q,"m"+(r.cI?"i":"")+(p?"g":""))}function b(r){for(var p=0;p<r.childNodes.length;p++){var q=r.childNodes[p];if(q.nodeName=="CODE"){return q}if(!(q.nodeType==3&&q.nodeValue.match(/\s+/))){break}}}function h(t,s){var p="";for(var r=0;r<t.childNodes.length;r++){if(t.childNodes[r].nodeType==3){var q=t.childNodes[r].nodeValue;if(s){q=q.replace(/\n/g,"")}p+=q}else{if(t.childNodes[r].nodeName=="BR"){p+="\n"}else{p+=h(t.childNodes[r])}}}if(/MSIE [678]/.test(navigator.userAgent)){p=p.replace(/\r/g,"\n")}return p}function a(s){var r=s.className.split(/\s+/);r=r.concat(s.parentNode.className.split(/\s+/));for(var q=0;q<r.length;q++){var p=r[q].replace(/^language-/,"");if(e[p]){return p}}}function c(q){var p=[];(function(s,t){for(var r=0;r<s.childNodes.length;r++){if(s.childNodes[r].nodeType==3){t+=s.childNodes[r].nodeValue.length}else{if(s.childNodes[r].nodeName=="BR"){t+=1}else{if(s.childNodes[r].nodeType==1){p.push({event:"start",offset:t,node:s.childNodes[r]});t=arguments.callee(s.childNodes[r],t);p.push({event:"stop",offset:t,node:s.childNodes[r]})}}}}return t})(q,0);return p}function k(y,w,x){var q=0;var z="";var s=[];function u(){if(y.length&&w.length){if(y[0].offset!=w[0].offset){return(y[0].offset<w[0].offset)?y:w}else{return w[0].event=="start"?y:w}}else{return y.length?y:w}}function t(D){var A="<"+D.nodeName.toLowerCase();for(var B=0;B<D.attributes.length;B++){var C=D.attributes[B];A+=" "+C.nodeName.toLowerCase();if(C.value!==undefined&&C.value!==false&&C.value!==null){A+='="'+m(C.value)+'"'}}return A+">"}while(y.length||w.length){var v=u().splice(0,1)[0];z+=m(x.substr(q,v.offset-q));q=v.offset;if(v.event=="start"){z+=t(v.node);s.push(v.node)}else{if(v.event=="stop"){var p,r=s.length;do{r--;p=s[r];z+=("</"+p.nodeName.toLowerCase()+">")}while(p!=v.node);s.splice(r,1);while(r<s.length){z+=t(s[r]);r++}}}}return z+m(x.substr(q))}function j(){function
q(x,y,v){if(x.compiled){return}var u;var s=[];if(x.k){x.lR=f(y,x.l||hljs.IR,true);for(var w in x.k){if(!x.k.hasOwnProperty(w)){continue}if(x.k[w] instanceof Object){u=x.k[w]}else{u=x.k;w="keyword"}for(var r in u){if(!u.hasOwnProperty(r)){continue}x.k[r]=[w,u[r]];s.push(r)}}}if(!v){if(x.bWK){x.b="\\b("+s.join("|")+")\\s"}x.bR=f(y,x.b?x.b:"\\B|\\b");if(!x.e&&!x.eW){x.e="\\B|\\b"}if(x.e){x.eR=f(y,x.e)}}if(x.i){x.iR=f(y,x.i)}if(x.r===undefined){x.r=1}if(!x.c){x.c=[]}x.compiled=true;for(var t=0;t<x.c.length;t++){if(x.c[t]=="self"){x.c[t]=x}q(x.c[t],y,false)}if(x.starts){q(x.starts,y,false)}}for(var p in e){if(!e.hasOwnProperty(p)){continue}q(e[p].dM,e[p],true)}}function d(B,C){if(!j.called){j();j.called=true}function q(r,M){for(var L=0;L<M.c.length;L++){if((M.c[L].bR.exec(r)||[null])[0]==r){return M.c[L]}}}function v(L,r){if(D[L].e&&D[L].eR.test(r)){return 1}if(D[L].eW){var M=v(L-1,r);return M?M+1:0}return 0}function w(r,L){return L.i&&L.iR.test(r)}function K(N,O){var M=[];for(var L=0;L<N.c.length;L++){M.push(N.c[L].b)}var r=D.length-1;do{if(D[r].e){M.push(D[r].e)}r--}while(D[r+1].eW);if(N.i){M.push(N.i)}return f(O,M.join("|"),true)}function p(M,L){var N=D[D.length-1];if(!N.t){N.t=K(N,E)}N.t.lastIndex=L;var r=N.t.exec(M);return r?[M.substr(L,r.index-L),r[0],false]:[M.substr(L),"",true]}function z(N,r){var L=E.cI?r[0].toLowerCase():r[0];var M=N.k[L];if(M&&M instanceof Array){return M}return false}function F(L,P){L=m(L);if(!P.k){return L}var r="";var O=0;P.lR.lastIndex=0;var M=P.lR.exec(L);while(M){r+=L.substr(O,M.index-O);var N=z(P,M);if(N){x+=N[1];r+='<span class="'+N[0]+'">'+M[0]+"</span>"}else{r+=M[0]}O=P.lR.lastIndex;M=P.lR.exec(L)}return r+L.substr(O,L.length-O)}function J(L,M){if(M.sL&&e[M.sL]){var r=d(M.sL,L);x+=r.keyword_count;return r.value}else{return F(L,M)}}function I(M,r){var L=M.cN?'<span class="'+M.cN+'">':"";if(M.rB){y+=L;M.buffer=""}else{if(M.eB){y+=m(r)+L;M.buffer=""}else{y+=L;M.buffer=r}}D.push(M);A+=M.r}function G(N,M,Q){var
R=D[D.length-1];if(Q){y+=J(R.buffer+N,R);return false}var P=q(M,R);if(P){y+=J(R.buffer+N,R);I(P,M);return P.rB}var L=v(D.length-1,M);if(L){var O=R.cN?"</span>":"";if(R.rE){y+=J(R.buffer+N,R)+O}else{if(R.eE){y+=J(R.buffer+N,R)+O+m(M)}else{y+=J(R.buffer+N+M,R)+O}}while(L>1){O=D[D.length-2].cN?"</span>":"";y+=O;L--;D.length--}var r=D[D.length-1];D.length--;D[D.length-1].buffer="";if(r.starts){I(r.starts,"")}return R.rE}if(w(M,R)){throw"Illegal"}}var E=e[B];var D=[E.dM];var A=0;var x=0;var y="";try{var s,u=0;E.dM.buffer="";do{s=p(C,u);var t=G(s[0],s[1],s[2]);u+=s[0].length;if(!t){u+=s[1].length}}while(!s[2]);if(D.length>1){throw"Illegal"}return{r:A,keyword_count:x,value:y}}catch(H){if(H=="Illegal"){return{r:0,keyword_count:0,value:m(C)}}else{throw H}}}function g(t){var p={keyword_count:0,r:0,value:m(t)};var r=p;for(var q in e){if(!e.hasOwnProperty(q)){continue}var s=d(q,t);s.language=q;if(s.keyword_count+s.r>r.keyword_count+r.r){r=s}if(s.keyword_count+s.r>p.keyword_count+p.r){r=p;p=s}}if(r.language){p.second_best=r}return p}function i(r,q,p){if(q){r=r.replace(/^((<[^>]+>|\t)+)/gm,function(t,w,v,u){return w.replace(/\t/g,q)})}if(p){r=r.replace(/\n/g,"<br>")}return r}function n(t,w,r){var x=h(t,r);var v=a(t);var y,s;if(v){y=d(v,x)}else{return}var q=c(t);if(q.length){s=document.createElement("pre");s.innerHTML=y.value;y.value=k(q,c(s),x)}y.value=i(y.value,w,r);var u=t.className;if(!u.match("(\\s|^)(language-)?"+v+"(\\s|$)")){u=u?(u+" "+v):v}if(/MSIE [678]/.test(navigator.userAgent)&&t.tagName=="CODE"&&t.parentNode.tagName=="PRE"){s=t.parentNode;var p=document.createElement("div");p.innerHTML="<pre><code>"+y.value+"</code></pre>";t=p.firstChild.firstChild;p.firstChild.cN=s.cN;s.parentNode.replaceChild(p.firstChild,s)}else{t.innerHTML=y.value}t.className=u;t.result={language:v,kw:y.keyword_count,re:y.r};if(y.second_best){t.second_best={language:y.second_best.language,kw:y.second_best.keyword_count,re:y.second_best.r}}}function o(){if(o.called){return}o.called=true;var
r=document.getElementsByTagName("pre");for(var p=0;p<r.length;p++){var q=b(r[p]);if(q){n(q,hljs.tabReplace)}}}function l(){if(window.addEventListener){window.addEventListener("DOMContentLoaded",o,false);window.addEventListener("load",o,false)}else{if(window.attachEvent){window.attachEvent("onload",o)}else{window.onload=o}}}var e={};this.LANGUAGES=e;this.highlight=d;this.highlightAuto=g;this.fixMarkup=i;this.highlightBlock=n;this.initHighlighting=o;this.initHighlightingOnLoad=l;this.IR="[a-zA-Z][a-zA-Z0-9_]*";this.UIR="[a-zA-Z_][a-zA-Z0-9_]*";this.NR="\\b\\d+(\\.\\d+)?";this.CNR="\\b(0[xX][a-fA-F0-9]+|(\\d+(\\.\\d*)?|\\.\\d+)([eE][-+]?\\d+)?)";this.BNR="\\b(0b[01]+)";this.RSR="!|!=|!==|%|%=|&|&&|&=|\\*|\\*=|\\+|\\+=|,|\\.|-|-=|/|/=|:|;|<|<<|<<=|<=|=|==|===|>|>=|>>|>>=|>>>|>>>=|\\?|\\[|\\{|\\(|\\^|\\^=|\\||\\|=|\\|\\||~";this.ER="(?![\\s\\S])";this.BE={b:"\\\\.",r:0};this.ASM={cN:"string",b:"'",e:"'",i:"\\n",c:[this.BE],r:0};this.QSM={cN:"string",b:'"',e:'"',i:"\\n",c:[this.BE],r:0};this.CLCM={cN:"comment",b:"//",e:"$"};this.CBLCLM={cN:"comment",b:"/\\*",e:"\\*/"};this.HCM={cN:"comment",b:"#",e:"$"};this.NM={cN:"number",b:this.NR,r:0};this.CNM={cN:"number",b:this.CNR,r:0};this.BNM={cN:"number",b:this.BNR,r:0};this.inherit=function(r,s){var p={};for(var q in r){p[q]=r[q]}if(s){for(var q in s){p[q]=s[q]}}return p}}();hljs.LANGUAGES.cpp=function(){var
a={keyword:{"false":1,"int":1,"float":1,"while":1,"private":1,"char":1,"catch":1,"export":1,virtual:1,operator:2,sizeof:2,dynamic_cast:2,typedef:2,const_cast:2,"const":1,struct:1,"for":1,static_cast:2,union:1,namespace:1,unsigned:1,"long":1,"throw":1,"volatile":2,"static":1,"protected":1,bool:1,template:1,mutable:1,"if":1,"public":1,friend:2,"do":1,"return":1,"goto":1,auto:1,"void":2,"enum":1,"else":1,"break":1,"new":1,extern:1,using:1,"true":1,"class":1,asm:1,"case":1,typeid:1,"short":1,reinterpret_cast:2,"default":1,"double":1,register:1,explicit:1,signed:1,typename:1,"try":1,"this":1,"switch":1,"continue":1,wchar_t:1,inline:1,"delete":1,alignof:1,char16_t:1,char32_t:1,constexpr:1,decltype:1,noexcept:1,nullptr:1,static_assert:1,thread_local:1,restrict:1,_Bool:1,complex:1},built_in:{std:1,string:1,cin:1,cout:1,cerr:1,clog:1,stringstream:1,istringstream:1,ostringstream:1,auto_ptr:1,deque:1,list:1,queue:1,stack:1,vector:1,map:1,set:1,bitset:1,multiset:1,multimap:1,unordered_set:1,unordered_map:1,unordered_multiset:1,unordered_multimap:1,array:1,shared_ptr:1}};return{dM:{k:a,i:"</",c:[hljs.CLCM,hljs.CBLCLM,hljs.QSM,{cN:"string",b:"'\\\\?.",e:"'",i:"."},{cN:"number",b:"\\b(\\d+(\\.\\d*)?|\\.\\d+)(u|U|l|L|ul|UL|f|F)"},hljs.CNM,{cN:"preprocessor",b:"#",e:"$"},{cN:"stl_container",b:"\\b(deque|list|queue|stack|vector|map|set|bitset|multiset|multimap|unordered_map|unordered_set|unordered_multiset|unordered_multimap|array)\\s*<",e:">",k:a,r:10,c:["self"]}]}}}();hljs.LANGUAGES.r={dM:{c:[hljs.HCM,{cN:"number",b:"\\b0[xX][0-9a-fA-F]+[Li]?\\b",e:hljs.IMMEDIATE_RE,r:0},{cN:"number",b:"\\b\\d+(?:[eE][+\\-]?\\d*)?L\\b",e:hljs.IMMEDIATE_RE,r:0},{cN:"number",b:"\\b\\d+\\.(?!\\d)(?:i\\b)?",e:hljs.IMMEDIATE_RE,r:1},{cN:"number",b:"\\b\\d+(?:\\.\\d*)?(?:[eE][+\\-]?\\d*)?i?\\b",e:hljs.IMMEDIATE_RE,r:0},{cN:"number",b:"\\.\\d+(?:[eE][+\\-]?\\d*)?i?\\b",e:hljs.IMMEDIATE_RE,r:1},{cN:"keyword",b:"(?:tryCatch|library|setGeneric|setGroupGeneric)\\b",e:hljs.IMMEDIATE_RE,r:10},
{cN:"keyword",b:"\\.\\.\\.",e:hljs.IMMEDIATE_RE,r:10},{cN:"keyword",b:"\\.\\.\\d+(?![\\w.])",e:hljs.IMMEDIATE_RE,r:10},{cN:"keyword",b:"\\b(?:function)",e:hljs.IMMEDIATE_RE,r:2},{cN:"keyword",b:"(?:if|in|break|next|repeat|else|for|return|switch|while|try|stop|warning|require|attach|detach|source|setMethod|setClass)\\b",e:hljs.IMMEDIATE_RE,r:1},{cN:"literal",b:"(?:NA|NA_integer_|NA_real_|NA_character_|NA_complex_)\\b",e:hljs.IMMEDIATE_RE,r:10},{cN:"literal",b:"(?:NULL|TRUE|FALSE|T|F|Inf|NaN)\\b",e:hljs.IMMEDIATE_RE,r:1},{cN:"identifier",b:"[a-zA-Z.][a-zA-Z0-9._]*\\b",e:hljs.IMMEDIATE_RE,r:0},{cN:"operator",b:"<\\-(?!\\s*\\d)",e:hljs.IMMEDIATE_RE,r:2},{cN:"operator",b:"\\->|<\\-",e:hljs.IMMEDIATE_RE,r:1},{cN:"operator",b:"%%|~",e:hljs.IMMEDIATE_RE},{cN:"operator",b:">=|<=|==|!=|\\|\\||&&|=|\\+|\\-|\\*|/|\\^|>|<|!|&|\\||\\$|:",e:hljs.IMMEDIATE_RE,r:0},{cN:"operator",b:"%",e:"%",i:"\\n",r:1},{cN:"identifier",b:"`",e:"`",r:0},{cN:"string",b:'"',e:'"',c:[hljs.BE],r:0},{cN:"string",b:"'",e:"'",c:[hljs.BE],r:0},{cN:"paren",b:"[[({\\])}]",e:hljs.IMMEDIATE_RE,r:0}]}};
// Register the highlighter to run once when the document has loaded; it then processes each
// <pre> whose leading <code> child carries a class naming a registered language.
hljs.initHighlightingOnLoad();
</script>
<style type="text/css">
/* Base typography shared by the page body and table cells. */
body, td {
font-family: sans-serif;
background-color: white;
font-size: 13px;
}
/* Cap the content width and let auto margins center the column. */
body {
max-width: 800px;
margin: auto;
padding: 1em;
line-height: 20px;
}
/* Monospace stack for inline code and code blocks. */
tt, code, pre {
font-family: 'DejaVu Sans Mono', 'Droid Sans Mono', 'Lucida Console', Consolas, Monaco, monospace;
}
/* Heading sizes step down from h1 to h6. */
h1 {
font-size:2.2em;
}
h2 {
font-size:1.8em;
}
h3 {
font-size:1.4em;
}
h4 {
font-size:1.0em;
}
h5 {
font-size:0.9em;
}
h6 {
font-size:0.8em;
}
a:visited {
color: rgb(50%, 0%, 50%);
}
/* Keep code blocks and images inside the content column. */
pre, img {
max-width: 100%;
}
/* Long code lines scroll horizontally rather than overflowing the page. */
pre {
overflow-x: auto;
}
pre code {
display: block; padding: 0.5em;
}
code {
font-size: 92%;
border: 1px solid #ccc;
}
/* Only code elements that carry a class attribute get the grey background. */
code[class] {
background-color: #F8F8F8;
}
table, td, th {
border: none;
}
/* Block quotes: grey text with a thick light bar on the left. */
blockquote {
color:#666666;
margin:0;
padding-left: 1em;
border-left: 0.5em #EEE solid;
}
/* Horizontal rules render as a thin dotted top border only. */
hr {
height: 0px;
border-bottom: none;
border-top-width: thin;
border-top-style: dotted;
border-top-color: #999999;
}
/* Print-only overrides. */
@media print {
/* Drop backgrounds and filters and force black text on every element. */
* {
background: transparent !important;
color: black !important;
filter:none !important;
-ms-filter: none !important;
}
body {
font-size:12pt;
max-width:100%;
}
a, a:visited {
text-decoration: underline;
}
/* Each <hr> is hidden and forces a page break before it. */
hr {
visibility: hidden;
page-break-before: always;
}
/* Avoid splitting code blocks, quotes, table rows and images across pages. */
pre, blockquote {
padding-right: 1em;
page-break-inside: avoid;
}
tr, img {
page-break-inside: avoid;
}
img {
max-width: 100% !important;
}
/* Left and right pages use mirrored side margins (20mm on one side, 10mm on the other). */
@page :left {
margin: 15mm 20mm 15mm 10mm;
}
@page :right {
margin: 15mm 10mm 15mm 20mm;
}
/* Require at least three lines of a block at the bottom and top of a page. */
p, h2, h3 {
orphans: 3; widows: 3;
}
/* Avoid a page break directly after these headings. */
h2, h3 {
page-break-after: avoid;
}
}
</style>
</head>
<body>
<h1>String Processing</h1>
<h2>Manipulating Text Data in R and Python, including Regular Expressions </h2>
<p>Chris Paciorek, Department of Statistics, UC Berkeley</p>
<h1>0) This Tutorial</h1>
<p>This tutorial covers tools for manipulating text data in R and Python, including the use of regular expressions. We also briefly discuss tools for reading and manipulating formatted text files such as HTML, XML, and JSON. At the moment, this tutorial is somewhat more focused on R than Python, but we hope to flesh out the Python sections more fully in the future.</p>
<p>If you have a standard R or Python installation and can install the <em>stringr</em> package for R and the <em>re</em> package for Python, you should be able to reproduce the results in this document.</p>
<p>This tutorial was originally developed using a virtual machine developed here at Berkeley, the <a href="http://bce.berkeley.edu">Berkeley Common Environment (BCE)</a>. BCE is a virtual Linux machine - basically it is a Linux computer that you can run within your own computer, regardless of whether you are using Windows, Mac, or Linux. This provides a common environment so that things behave the same for all of us. However, BCE has not been updated in a while, so I don't suggest you use it at this point in time.</p>
<p>This tutorial assumes you have a working knowledge of R or Python. </p>
<p>Materials for this tutorial, including the R markdown file and associated code files that were used to create this document, are available on GitHub at <a href="https://github.com/berkeley-scf/tutorial-string-processing">https://github.com/berkeley-scf/tutorial-string-processing</a>. You can download the files by doing a git clone from a terminal window on a UNIX-like machine, as follows:</p>
<pre><code class="r">git clone https://github.com/berkeley-scf/tutorial-string-processing
</code></pre>
<p>To create this HTML document, simply compile the corresponding R Markdown file in R as follows (the following will work from within BCE after cloning the repository as above).</p>
<pre><code class="r">Rscript -e "library(knitr); knit2html('string-processing.Rmd')"
</code></pre>
<p>This tutorial by Christopher Paciorek is licensed under a Creative Commons Attribution 3.0 Unported License.</p>
<h1>1) Background</h1>
<p>Text manipulations in R, Python, Perl, and bash have a number of things
in common, as many of these evolved from UNIX. When I use the
term <em>string</em> here, I'll be referring to any sequence of characters
that may include numbers, white space, and special characters. Note that in R
a character vector is a vector of one or more such strings. </p>
<h1>2) Basic text manipulation</h1>
<p>Some of the basic things we need to do are paste/concatenate strings together,
split strings apart, take subsets of strings, and replace characters within strings.
Often these operations are done based on patterns rather than a fixed string
sequence. This involves the use of regular expressions, covered in Section 3.</p>
<h2>2.1) R</h2>
<p>In general, strings in R are stored in character vectors. R's functions for string manipulation are fully vectorized and will work on all of the strings in a vector at once.</p>
<p>Here's a <a href="https://github.com/rstudio/cheatsheets/raw/master/strings.pdf">cheatsheet from RStudio</a> on manipulating strings in R.</p>
<h3>2.1.1) String manipulation in base R</h3>
<p>A few of the basic R functions for manipulating strings are <em>paste</em>,
<em>strsplit</em>, and <em>substring</em>. <em>paste</em> and <em>strsplit</em>
are basically inverses of each other: <em>paste</em> concatenates
together an arbitrary set of strings (or a vector, if using the <em>collapse</em>
argument) with a user-specified separator character, while <em>strsplit</em>
splits apart based on a delimiter/separator. <em>substring</em>
splits apart the elements of a character vector based on fixed widths.
<em>nchar</em> returns the number of characters in a string.
Note that all of these operate in a vectorized fashion.</p>
<pre><code class="r">out &lt;- paste("My", "name", "is", "Chris", ".", sep = " ")
paste(c("My", "name", "is", "Chris", "."), collapse = " ") # equivalent
</code></pre>
<pre><code>## [1] "My name is Chris ."
</code></pre>
<pre><code class="r">strsplit(out, split = ' ')
</code></pre>
<pre><code>## [[1]]
## [1] "My" "name" "is" "Chris" "."
</code></pre>
<pre><code class="r">nchar(out)
</code></pre>
<pre><code>## [1] 18
</code></pre>
<p>Note that <em>strsplit</em> returns a list because it can operate
on a character vector (i.e., on multiple strings).</p>
<pre><code class="r">times &lt;- c("04:18:04", "12:12:53", "13:47:00")
substring(times, 7, 8)
</code></pre>
<pre><code>## [1] "04" "53" "00"
</code></pre>
<pre><code class="r">substring(times[3], 1, 2) &lt;- '01' ## replacement
times
</code></pre>
<pre><code>## [1] "04:18:04" "12:12:53" "01:47:00"
</code></pre>
<p>To identify particular subsequences in strings, there are several
related R functions. <em>grep</em> will look for a specified string
within an R character vector and report back indices identifying the
elements of the vector in which the string was found. Note that using the
<code>fixed=TRUE</code> argument ensures that regular expressions are NOT
used. <em>gregexpr</em> will indicate the position in each string
that the specified string is found (use <em>regexpr</em> if you only
want the first occurrence). <em>gsub</em> can be used to replace a
specified string with a replacement string (use <em>sub</em> if you
want to replace only the first occurrence).</p>
<pre><code class="r">dates &lt;- c("2016-08-03", "2007-09-05", "2016-01-02")
grep("2016", dates)
</code></pre>
<pre><code>## [1] 1 3
</code></pre>
<pre><code class="r">gregexpr("2016", dates)
</code></pre>
<pre><code>## [[1]]
## [1] 1
## attr(,"match.length")
## [1] 4
## attr(,"index.type")
## [1] "chars"
## attr(,"useBytes")
## [1] TRUE
##
## [[2]]
## [1] -1
## attr(,"match.length")
## [1] -1
## attr(,"index.type")
## [1] "chars"
## attr(,"useBytes")
## [1] TRUE
##
## [[3]]
## [1] 1
## attr(,"match.length")
## [1] 4
## attr(,"index.type")
## [1] "chars"
## attr(,"useBytes")
## [1] TRUE
</code></pre>
<pre><code class="r">gsub("2016", "16", dates)
</code></pre>
<pre><code>## [1] "16-08-03" "2007-09-05" "16-01-02"
</code></pre>
<h3>2.1.2) String manipulation using <em>stringr</em></h3>
<p>The <em>stringr</em> package wraps the various core string manipulation
functions to provide a common interface. It also removes some of the
clunkiness involved in some of the string operations with the base
string functions, such as having to call <em>gregexpr</em> and
then <em>regmatches</em> to pull out the matched strings. In general, I'd suggest using <em>stringr</em> functions
in place of R's base string functions.</p>
<p>Here's </p>
<p>First let's see <em>stringr</em>'s versions of some of the base string functions mentioned in the previous sections.</p>
<p>The basic interface to <em>stringr</em> functions is <code>function(strings, pattern, [replacement])</code>. </p>
<p>Table 1 provides an overview of the key functions related to working with patterns, which are basically
wrappers for <em>grep</em>, <em>gsub</em>, <em>gregexpr</em>, etc.</p>
<table>
<!-- The surrounding prose refers to this as "Table 1"; the caption makes that label visible and gives the table an accessible name. -->
<caption>Table 1: Key <em>stringr</em> functions for working with patterns</caption>
<thead>
<tr>
<th scope="col">Function</th>
<th scope="col">What it does</th>
</tr>
</thead>
<tbody>
<tr>
<td>str_detect</td>
<td>detects pattern, returning TRUE/FALSE</td>
</tr>
<tr>
<td>str_count</td>
<td>counts matches</td>
</tr>
<tr>
<td>str_locate/str_locate_all</td>
<td>detects pattern, returning positions of matching characters</td>
</tr>
<tr>
<td>str_extract/str_extract_all</td>
<td>detects pattern, returning matches</td>
</tr>
<tr>
<td>str_replace/str_replace_all</td>
<td>detects pattern and replaces matches</td>
</tr>
</tbody>
</table>
<p>The analog of <em>regexpr</em> vs. <em>gregexpr</em> and <em>sub</em>
vs. <em>gsub</em> is that most of the functions have versions that
return all the matches, not just the first match, e.g. <em>str_locate_all</em>,
<em>str_extract_all</em>, etc. Note that the <em>_all</em> functions return
lists while the non-<em>_all</em> functions return vectors.</p>
<p>To specify options, you can wrap these functions around the pattern
argument: <code>fixed(pattern, ignore_case)</code> and <code>regex(pattern, ignore_case)</code>.
The default is <em>regex</em>, so you only need to specify that if you also want to
specify additional arguments, such as <em>ignore_case</em> or others listed under <code>help(regex)</code> (invoke the help after loading <em>stringr</em>)</p>
<p>Here's an example:</p>
<pre><code class="r">library(stringr)
str &lt;- c("Apple Computer", "IBM", "Apple apps")
str_locate(str, fixed("app", ignore_case = TRUE))
</code></pre>
<pre><code>## start end
## [1,] 1 3
## [2,] NA NA
## [3,] 1 3
</code></pre>
<pre><code class="r">str_locate_all(str, fixed("app", ignore_case = TRUE))
</code></pre>
<pre><code>## [[1]]
## start end
## [1,] 1 3
##
## [[2]]
## start end
##
## [[3]]
## start end
## [1,] 1 3
## [2,] 7 9
</code></pre>
<pre><code class="r">dates &lt;- c("2016-08-03", "2007-09-05", "2016-01-02")
str_locate(dates, "20[^0][0-9]") ## regular expression: years begin in 2010
</code></pre>
<pre><code>## start end
## [1,] 1 4
## [2,] NA NA
## [3,] 1 4
</code></pre>
<h2>2.2) Basic text manipulation in Python</h2>
<p>Let's see basic concatenation, splitting, working with substrings, and searching/replacing
substrings. Notice that Python's string functionality is object-oriented (though <em>len</em> is not).
Note: apologies for all the extra print statements in the code - this is required when running Python
chunks in R Markdown.</p>
<pre><code class="python">print( "My" + "name" + "is" + "Chris" + "." )
</code></pre>
<pre><code>## MynameisChris.
</code></pre>
<pre><code class="python">out = ' '.join(("My", "name", "is", "Chris", "."))
print(out)
</code></pre>
<pre><code>## My name is Chris .
</code></pre>
<pre><code class="python">print( len(out) )
</code></pre>
<pre><code>## 18
</code></pre>
<pre><code class="python">print( out.split(' ') )
</code></pre>
<pre><code>## ['My', 'name', 'is', 'Chris', '.']
</code></pre>
<p>In IPython, hitting tab after typing <code>out.</code> when <em>out</em> is a string will show the full suite of string-related methods.</p>
<p>Unlike in R, you cannot use the string methods directly on a list or tuple of strings, but you of course can do things like list comprehension to easily process multiple strings.</p>
<p>Working with substrings relies on the fact that Python works with strings as if they are vectors of individual characters.</p>
<pre><code class="python">var = "13:47:00"
print( var[3:5] )
</code></pre>
<pre><code>## 47
</code></pre>
<p>However strings are immutable - you cannot alter a subset of characters in the string. Another option is to work with strings as lists.</p>
<pre><code class="python">var = list("13:47:00")
print( var )
</code></pre>
<pre><code>## ['1', '3', ':', '4', '7', ':', '0', '0']
</code></pre>
<pre><code class="python">var[0:2] = ["0", "1"]
print( ''.join(var) )
</code></pre>
<pre><code>## 01:47:00
</code></pre>
<p>Now let's consider finding substrings.</p>
<pre><code class="python">vars = ["08-03-2016", "09-05-2007", "01-02-2016"]
print( vars[0].find('2016') )
</code></pre>
<pre><code>## 6
</code></pre>
<pre><code class="python">print( vars[1].find('2016') )
</code></pre>
<pre><code>## -1
</code></pre>
<pre><code class="python">print( [v.find('2016') for v in vars] )
</code></pre>
<pre><code>## [6, -1, 6]
</code></pre>
<h1>3) Regular expressions (regex/regexp)</h1>
<p>Regular expressions are a domain-specific language for
finding patterns and are one of the key functionalities in scripting
languages such as Perl and Python, as well as the UNIX utilities <em>sed</em>,
<em>awk</em> and <em>grep</em>. </p>
<p>The basic idea of regular expressions is that they allow us to find
matches of strings or patterns in strings, as well as do substitution.
Regular expressions are good for tasks such as:</p>
<ul>
<li>extracting pieces of text - for example finding all the links in an html document;</li>
<li> creating variables from information found in text;</li>
<li> cleaning and transforming text into a uniform format;</li>
<li> mining text by treating documents as data; and</li>
<li> scraping the web for data.</li>
</ul>
<p>Please look at Section 3 of <a href="https://github.com/berkeley-scf/tutorial-using-bash">our tutorial on using the bash shell</a> to learn the regular expression syntax that we'll use in the remainder of this tutorial. For other resources, Duncan Temple Lang (UC Davis Statistics)
has written a nice tutorial that is part of the repository for this tutorial,
or check out Sections 9.9 and 11 of <a href="http://www.stat.auckland.ac.nz/%7Epaul/ItDT">Paul Murrell's book</a>.</p>
<p>Also, here's a <a href="https://www.rstudio.com/wp-content/uploads/2016/09/RegExCheatsheet.pdf">cheatsheet on regular expressions</a> and here is a <a href="https://regex101.com">website where you can interactively test regular expressions on example strings</a>.</p>
<h2>3.1) Versions of regular expressions</h2>
<p>One thing that can cause headaches is differences in the version of regular expression syntax used. As discussed in <code>man grep</code>, <em>extended regular expressions</em> are standard, with <em>basic regular expressions</em> providing somewhat less functionality and <em>Perl regular expressions</em> providing additional functionality.
As can be seen in <code>help(regex)</code>, in R, <em>stringr</em> provides <em>ICU regular expressions</em>, which are based on Perl regular expressions. More details can be found in the <a href="https://en.wikipedia.org/wiki/Regular_expression">regex Wikipedia page</a>.</p>
<p>The tutorial on using bash provides a full documentation of the various <em>extended regular expressions</em> syntax, which we'll focus on here. This should be sufficient for most usage and should be usable in R and Python, but if you notice something funny going on, it might be due to differences between the regular expressions versions. </p>
<h2>3.2) General principles for working with regex</h2>
<p>The syntax is very concise, so it's helpful to break down
individual regular expressions into the component parts to understand
them. As Murrell notes, since regex are their own language, it's
a good idea to build up a regex in pieces as a way of avoiding errors
just as we would with any computer code. <em>str_detect</em> in R's <em>stringr</em> and <em>re.findall</em> in Python are particularly
useful in seeing <em>what</em> was matched to help in understanding
and learning regular expression syntax and debugging your regex. As with
many kinds of coding, I find that debugging my regex is usually what takes
most of my time.</p>
<h2>3.3) Using regex in R</h2>
<p>The <em>grep</em>, <em>gregexpr</em> and <em>gsub</em> functions and
their <em>stringr</em> analogs are more powerful when used with regular
expressions. In the following examples, we'll illustrate usage of <em>stringr</em> functions, but
with their base R analogs as comments.</p>
<h3>3.3.1) Working with patterns</h3>
<p>First let's see the use of character sets and character classes.</p>
<pre><code class="r">text <- c("Here's my number: 919-543-3300.", "hi John, good to meet you",
"They bought 731 bananas", "Please call 919.554.3800")
str_detect(text, "[[:digit:]]")
</code></pre>
<pre><code>## [1] TRUE FALSE TRUE TRUE
</code></pre>
<pre><code class="r">## grep("[[:digit:]]", text, perl = TRUE)
</code></pre>
<pre><code class="r">str_detect(text, "[:,\t.]")
</code></pre>
<pre><code>## [1] TRUE TRUE FALSE TRUE
</code></pre>
<pre><code class="r">## grep("[:,\t.]", text)
str_locate_all(text, "[:,\t.]")
</code></pre>
<pre><code>## [[1]]
## start end
## [1,] 17 17
## [2,] 31 31
##
## [[2]]
## start end
## [1,] 8 8
##
## [[3]]
## start end
##
## [[4]]
## start end
## [1,] 16 16
## [2,] 20 20
</code></pre>
<pre><code class="r">## gregexpr("[:,\t.]", text)
str_extract_all(text, "[[:digit:]]+")
</code></pre>
<pre><code>## [[1]]
## [1] "919" "543" "3300"
##
## [[2]]
## character(0)
##
## [[3]]
## [1] "731"
##
## [[4]]
## [1] "919" "554" "3800"
</code></pre>
<pre><code class="r">## matches <- gregexpr("[[:digit:]]+", text)
## regmatches(text, matches)
str_replace_all(text, "[[:digit:]]", "Z")
</code></pre>
<pre><code>## [1] "Here's my number: ZZZ-ZZZ-ZZZZ." "hi John, good to meet you"
## [3] "They bought ZZZ bananas" "Please call ZZZ.ZZZ.ZZZZ"
</code></pre>
<pre><code class="r">## gsub("[[:digit:]]", "Z", text)
</code></pre>
<p>Challenge: how would we find a spam-like pattern with digits or non-letters inside a word? E.g., I want to find “V1agra” or “Fancy repl!c@ted watches”.</p>
<p>Next let's consider location-specific matches.</p>
<pre><code class="r">str_detect(text, "^[[:upper:]]") # text starting with upper case letter
</code></pre>
<pre><code>## [1] TRUE FALSE TRUE TRUE
</code></pre>
<pre><code class="r">## grep("^[[:upper:]]", text)
str_detect(text, "[[:digit:]]$") # text ending with a digit
</code></pre>
<pre><code>## [1] FALSE FALSE FALSE TRUE
</code></pre>
<pre><code class="r">## grep("[[:digit:]]$", text)
</code></pre>
<p>Now let's make use of repetitions.</p>
<p>Let's search for US/Canadian/Caribbean phone numbers in the example text we've been using: </p>
<pre><code class="r">text <- c("Here's my number: 919-543-3300.", "hi John, good to meet you",
"They bought 731 bananas", "Please call 919.554.3800")
pattern <- "[[:digit:]]{3}[-.][[:digit:]]{3}[-.][[:digit:]]{4}"
str_extract_all(text, pattern)
</code></pre>
<pre><code>## [[1]]
## [1] "919-543-3300"
##
## [[2]]
## character(0)
##
## [[3]]
## character(0)
##
## [[4]]
## [1] "919.554.3800"
</code></pre>
<pre><code class="r">## matches <- gregexpr(pattern, text)
## regmatches(text, matches)
</code></pre>
<p>Challenge: How would I extract an email address from an arbitrary text string?</p>
<p>Next consider grouping.</p>
<p>For example, the phone number detection problem could have been done a bit more compactly (and more generally, in case the area code is omitted or a 1 is included) as:</p>
<pre><code class="r">str_extract_all(text, "(1[-.])?([[:digit:]]{3}[-.]){1,2}[[:digit:]]{4}")
</code></pre>
<pre><code>## [[1]]
## [1] "919-543-3300"
##
## [[2]]
## character(0)
##
## [[3]]
## character(0)
##
## [[4]]
## [1] "919.554.3800"
</code></pre>
<pre><code class="r">## matches <- gregexpr("(1[-.])?([[:digit:]]{3}[-.]){1,2}[[:digit:]]{4}", text)
## regmatches(text, matches)
</code></pre>
<p>Challenge: the above pattern would actually match something that is not a valid phone number. What can go wrong?</p>
<p>Here's a basic example of using grouping via parentheses with the OR operator.</p>
<pre><code class="r">text <- c("at the site http://www.ibm.com", "other text", "ftp://ibm.com")
str_locate(text, "(http|ftp):\\/\\/")
</code></pre>
<pre><code>## start end
## [1,] 13 19
## [2,] NA NA
## [3,] 1 6
</code></pre>
<pre><code class="r">## gregexpr("(http|ftp):\\/\\/", text)
</code></pre>
<p>Parentheses are also used for referencing back to a detected pattern when doing a replacement. For example, here we'll find any numbers and add underscores before and after them:</p>
<pre><code class="r">text <- c("Here's my number: 919-543-3300.", "hi John, good to meet you",
"They bought 731 bananas", "Please call 919.554.3800")
str_replace_all(text, "([0-9]+)", "_\\1_")
</code></pre>
<pre><code>## [1] "Here's my number: _919_-_543_-_3300_."
## [2] "hi John, good to meet you"
## [3] "They bought _731_ bananas"
## [4] "Please call _919_._554_._3800_"
</code></pre>
<p>Here we'll remove commas not used as field separators.</p>
<pre><code class="r">text <- ('"H4NY07011","ACKERMAN, GARY L.","H","$13,242",,,')
clean_text <- str_replace_all(text, "([^\",]),", "\\1")
clean_text
</code></pre>
<pre><code>## [1] "\"H4NY07011\",\"ACKERMAN GARY L.\",\"H\",\"$13242\",,,"
</code></pre>
<pre><code class="r">cat(clean_text)
</code></pre>
<pre><code>## "H4NY07011","ACKERMAN GARY L.","H","$13242",,,
</code></pre>
<pre><code class="r">## gsub("([^\",]),", "\\1", text)
</code></pre>
<p>Challenge: Suppose a text string has dates in the form “Aug-3”, “May-9”, etc. and I want them in the form “3 Aug”, “9 May”, etc. How would I do this search/replace?</p>
<p>Finally let's consider greedy matching.</p>
<pre><code class="r">text &lt;- "Do an internship &lt;b&gt; in place &lt;/b&gt; of &lt;b&gt; one &lt;/b&gt; course."
str_replace_all(text, "<.*>", "")
</code></pre>
<pre><code>## [1] "Do an internship course."
</code></pre>
<pre><code class="r">## gsub("<.*>", "", text)
</code></pre>
<p>What went wrong?</p>
<p>One solution is to append a ? to the repetition syntax to cause the matching to be non-greedy. Here's an example.</p>
<pre><code class="r">str_replace_all(text, "<.*?>", "")
</code></pre>
<pre><code>## [1] "Do an internship in place of one course."
</code></pre>
<pre><code class="r">## gsub("<.*?>", "", text)
</code></pre>
<p>However, one can often avoid greedy matching by being more clever. </p>
<p>Challenge: How could we change our regexp to avoid the greedy matching without using the “?”?</p>
<h3>3.3.2) Other comments</h3>
<p>If we are working with newlines embedded in a string, we can include the newline character as a regular character that is matched by a “.” by first creating the regular expression with <em>stringr::regex</em> with the <em>dotall</em> argument set to <code>TRUE</code>:</p>
<pre><code class="r">myex &lt;- regex("&lt;p&gt;.*&lt;/p&gt;", dotall = TRUE)
html_string &lt;- "And &lt;p&gt;here is some\ninformation&lt;/p&gt; for you."
str_extract(html_string, myex)
</code></pre>
<pre><code>## [1] "&lt;p&gt;here is some\ninformation&lt;/p&gt;"
</code></pre>
<pre><code class="r">str_extract(html_string, "&lt;p&gt;.*&lt;/p&gt;")
</code></pre>
<pre><code>## [1] NA
</code></pre>
<p>Regular expressions can be used in a variety of places, e.g., to split by any number of white space characters:</p>
<pre><code class="r">line <- "a dog\tjumped\nover \tthe moon."
cat(line)
</code></pre>
<pre><code>## a dog jumped
## over the moon.
</code></pre>
<pre><code class="r">str_split(line, "[[:space:]]+")
</code></pre>
<pre><code>## [[1]]
## [1] "a" "dog" "jumped" "over" "the" "moon."
</code></pre>
<pre><code class="r">str_split(line, "[[:blank:]]+")
</code></pre>
<pre><code>## [[1]]
## [1] "a" "dog" "jumped\nover" "the"
## [5] "moon."
</code></pre>
<p>Using backslashes to 'escape' particular characters can be tricky. One rule of thumb is to just keep adding backslashes until you get what you want!</p>
<pre><code class="r">## last case here is literally a backslash and then 'n'
strings <- c("Hello", "Hello.", "Hello\nthere", "Hello\\nthere")
cat(strings, sep = "\n")
</code></pre>
<pre><code>## Hello
## Hello.
## Hello
## there
## Hello\nthere
</code></pre>
<pre><code class="r">str_detect(strings, ".") ## . means any character
</code></pre>
<pre><code>## [1] TRUE TRUE TRUE TRUE
</code></pre>
<pre><code class="r">## str_detect(strings, "\.") ## \. looks for the special symbol \.
str_detect(strings, "\\.") ## \\ says treat \ literally, which then escapes the .
</code></pre>
<pre><code>## [1] FALSE TRUE FALSE FALSE
</code></pre>
<pre><code class="r">str_detect(strings, "\n") ## \n looks for the special symbol \n
</code></pre>
<pre><code>## [1] FALSE FALSE TRUE FALSE
</code></pre>
<pre><code class="r">## str_detect(strings, "\\") ## \\ says treat \ literally, but \ is not meaningful regex
str_detect(strings, "\\\\") ## R parser removes two \ to give \\; then in regex \\ treats second \ literally
</code></pre>
<pre><code>## [1] FALSE FALSE FALSE TRUE
</code></pre>
<h2>3.4) Using regex in Python</h2>
<h3>3.4.1) Working with patterns</h3>
<p>For working with regex in Python, we'll need the <em>re</em> package. It provides Perl-style regular expressions, but it doesn't seem to support named character classes such as <code>[:digit:]</code>. Instead use classes such as <code>\d</code> and <code>[0-9]</code>.</p>
<p>Again, in the code chunks that follow, all the explicit print statements are needed for R Markdown to print out the values.</p>
<p>In Python, you apply a matching function and then query the result to get information about what was matched and where in the string. </p>
<pre><code class="python">import re
text = ["Here's my number: 919-543-3300.", "hi John, good to meet you",
"They bought 731 bananas", "Please call 919.554.3800"]
m = re.search("\d+", text[0])
print( m.group() )
</code></pre>
<pre><code>## 919
</code></pre>
<pre><code class="python">print( m.start() )
</code></pre>
<pre><code>## 18
</code></pre>
<pre><code class="python">print( m.end() )
</code></pre>
<pre><code>## 21
</code></pre>
<pre><code class="python">print( m.span() )
</code></pre>
<pre><code>## (18, 21)
</code></pre>
<pre><code class="python">import re
text = ["Here's my number: 919-543-3300.", "hi John, good to meet you",
"They bought 731 bananas", "Please call 919.554.3800"]
print( re.findall("\d+", text[0]) )
</code></pre>
<pre><code>## ['919', '543', '3300']
</code></pre>
<p>To ignore case, do the following:</p>
<pre><code class="python">import re
str = "That cat in the Hat"
print( re.findall("hat", str, re.IGNORECASE) )
</code></pre>
<pre><code>## ['hat', 'Hat']
</code></pre>
<p>We can of course use list comprehension to work with multiple strings. But we need to be careful to check whether a match was found.</p>
<pre><code class="python">import re
text = ["Here's my number: 919-543-3300.", "hi John, good to meet you",
"They bought 731 bananas", "Please call 919.554.3800"]
def return_group(pattern, txt):
m = re.search(pattern, txt)
if m:
return(m.group())
else:
return(None)
print( [return_group("\d+", str) for str in text] )
</code></pre>
<pre><code>## ['919', None, '731', '919']
</code></pre>
<p>Next, let's look at replacing patterns.</p>
<pre><code class="python">import re
text = ["Here's my number: 919-543-3300.", "hi John, good to meet you",
"They bought 731 bananas", "Please call 919.554.3800"]
print( re.sub("\d", "Z", text[0]) )
</code></pre>
<pre><code>## Here's my number: ZZZ-ZZZ-ZZZZ.
</code></pre>
<pre><code class="python">import re
text = '"H4NY07011","ACKERMAN, GARY L.","H","$13,242",,,'
print( re.sub("([^\",]),", "\\1", text) )
</code></pre>
<pre><code>## "H4NY07011","ACKERMAN GARY L.","H","$13242",,,
</code></pre>
<p>Finally, let's see the consequences of greedy matching and use of <code>?</code> to avoid greedy matching.</p>
<pre><code class="python">import re
text = "Do an internship &lt;b&gt; in place &lt;/b&gt; of &lt;b&gt; one &lt;/b&gt; course."
print( re.sub("<.*>", "", text) )
</code></pre>