Introduction to Biostatistical Computing PHC 6937

Advanced string manipulation

Zhiguang Huo (Caleb)

Wednesday Oct 19th, 2022

Outline

Review of basic string operations

title = "I love introduction to Biostatistical computing!"
title.find("love")
## 2
title.find("XX")
## -1
title.index("love")
## 2
title.replace("love", "like")
## 'I like introduction to Biostatistical computing!'

Regular expression (regex)

import re ## import the Python regular expression module

re.search / re.match

text = "Have a nice day!"
re.match("Have", text)
## <re.Match object; span=(0, 4), match='Have'>
amatch = re.match("Have", text)
if amatch:
    print("Wonderful!")
else:
    print("Oops")
## Wonderful!
amatch = re.match("Have", text)
print(amatch.span())
## (0, 4)
amatch = re.match("Have", text)
print(amatch.group())
## Have

split

text = "Alex works diligently. Alex gets good grades. Our student Alex is successful"

re.split("Alex", text) ## regular expression
## ['', ' works diligently. ', ' gets good grades. Our student ', ' is successful']
text.split("Alex") ## traditional method
## ['', ' works diligently. ', ' gets good grades. Our student ', ' is successful']

findall

text = "Alex works diligently. Alex gets good grades. Our student Alex is successful"

re.findall("Alex", text)
## ['Alex', 'Alex', 'Alex']
re.findall("student", text)
## ['student']
re.findall(" s", text)
## [' s', ' s']
re.findall("s ", text)
## ['s ', 's ', 's ']

Complex patterns

grades = "ACAAAABCBCBAA"
re.findall("A", grades)
## ['A', 'A', 'A', 'A', 'A', 'A', 'A']
re.findall("^A", grades)
## ['A']
re.findall("A$", grades)
## ['A']

Complex patterns

grades = "ACAAAABCBCBAA"
re.findall("[AB]", grades) ## A or B
## ['A', 'A', 'A', 'A', 'A', 'B', 'B', 'B', 'A', 'A']
content = "I love biostatistical computing"
re.findall("[A-Z]", content) 
## ['I']
re.findall("[A-Z]", content.title()) 
## ['I', 'L', 'B', 'C']

Complex patterns

content = "I love biostatistical computing"
re.findall("[a-z]", content.title()) 
## ['o', 'v', 'e', 'i', 'o', 's', 't', 'a', 't', 'i', 's', 't', 'i', 'c', 'a', 'l', 'o', 'm', 'p', 'u', 't', 'i', 'n', 'g']
content = "A1B2C3D4"
re.findall("[0-9]", content) 
## ['1', '2', '3', '4']

Complex patterns

content = "A1B2C3 D4\n E5\t"
re.findall("\w", content) 
## ['A', '1', 'B', '2', 'C', '3', 'D', '4', 'E', '5']
re.findall("\d", content) 
## ['1', '2', '3', '4', '5']
re.findall("\s", content) 
## [' ', '\n', ' ', '\t']

Complex patterns

grades = "ACAAAABCBCBAA"
re.findall("A|B", grades) ## A or B
## ['A', 'A', 'A', 'A', 'A', 'B', 'B', 'B', 'A', 'A']
grades = "ACAAAABCBCBAA"
re.findall("[A][B-C]", grades)
### The [A][B-C] pattern denotes two character sets that must be matched back to back: an A immediately followed by a B or a C.
## ['AC', 'AB']
grades = "ACAAAABCBCBAA"
re.findall("AB|AC", grades) ## AB or AC
## ['AC', 'AB']

Complex patterns: do not match [^]

grades = "ACAAAABCBCBAA"
re.findall("[^A]", grades) ## in the [], not A
## ['C', 'B', 'C', 'B', 'C', 'B']
grades = "ACAAAABCBCBAA"
re.findall("^[^A]", grades) ## the first ^ anchors the start of the string; the ^ inside [] means "not A"
## []
grades = "ACAAAABCBCBAA"
re.findall("^[^B]", grades) 
## ['A']

Match telephone numbers, in-class practice

cell_numbers = "123 456-7890, 321-654-7890, (213)456-7980, (123)-456-0987, 1254367890, 123-4587690"
re.findall("\(\d{3}\)\d{3}-\d{4}", cell_numbers)
## ['(213)456-7980']
re.findall("\d{3}-\d{3}-\d{4}", cell_numbers)
## ['321-654-7890']

Quantifiers

grades = "ACAAAABCBCBAA"
re.findall("A{2,10}", grades) ## 2: minimum number of A; 10: maximum number of A
## ['AAAA', 'AA']
grades = "ACAAAABCBCBAA"
re.findall("A{1,1}A{1,1}", grades)
## ['AA', 'AA', 'AA']
grades = "ACAAAABCBCBAA"
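re.findall("A{2, 2}", grades) ## a space inside {} invalidates the quantifier, so the pattern is matched as the literal text "A{2, 2}"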
## []
grades = "ACAAAABCBCBAA"
re.findall("A{2}", grades)
## ['AA', 'AA', 'AA']
re.findall("A{1,10}B{1,10}C{1,10}", grades)
## ['AAAABC']

Make the maximum arbitrarily large

grades = "ACAAAABCBCBAA"
re.findall("A+B+C+", grades)
## ['AAAABC']
re.findall("A?B?C?", grades)
## ['AC', 'A', 'A', 'A', 'ABC', 'BC', 'B', 'A', 'A', '']
re.findall("A*B+", grades)
## ['AAAAB', 'B', 'B']

Remove extra whitespace, in-class exercise

text = " Alex    works diligently.  \n\n  Alex   gets   good grades. Our student   Alex is successful  \n"
re.sub("\s+", " ", text.strip())
## 'Alex works diligently. Alex gets good grades. Our student Alex is successful'

Examples

with open("ferpa.txt", "r") as file:
    wiki = file.read()

print(wiki)
## Overview[edit]
## FERPA gives parents access to their child's education records, an opportunity to seek to have the records amended, and some control over the disclosure of information from the records. With several exceptions, schools must have a student's consent prior to the disclosure of education records after that student is 18 years old. The law applies only to educational agencies and institutions that receive funds under a program administered by the U.S. Department of Education.
## 
## Other regulations under this act, effective starting January 3, 2012, allow for greater disclosures of personal and directory student identifying information and regulate student IDs and e-mail addresses.[2] For example, schools may provide external companies with a student's personally identifiable information without the student's consent.[2]
## 
## Examples of situations affected by FERPA include school employees divulging information to anyone other than the student about the student's grades or behavior, and school work posted on a bulletin board with a grade. Generally, schools must have written permission from the parent or eligible student in order to release any information from a student's education record.
## 
## This privacy policy also governs how state agencies transmit testing data to federal agencies, such as the Education Data Exchange Network.
## 
## This U.S. federal law also gave students 18 years of age or older, or students of any age if enrolled in any post-secondary educational institution, the right of privacy regarding grades, enrollment, and even billing information unless the school has specific permission from the student to share that specific type of information.
## 
## FERPA also permits a school to disclose personally identifiable information from education records of an "eligible student" (a student age 18 or older or enrolled in a postsecondary institution at any age) to his or her parents if the student is a "dependent student" as that term is defined in Section 152 of the Internal Revenue Code. Generally, if either parent has claimed the student as a dependent on the parent's most recent income tax statement, the school may non-consensually disclose the student's education records to both parents.[3]
## 
## The law allowed students who apply to an educational institution such as graduate school permission to view recommendations submitted by others as part of the application. However, on standard application forms, students are given the option to waive this right.
## 
## FERPA specifically excludes employees of an educational institution if they are not students.
## 
## The act is also referred to as the Buckley Amendment, for one of its proponents, Senator James L. Buckley of New York.
## 
## Access to public records[edit]
## The citing of FERPA to conceal public records that are not "educational" in nature has been widely criticized, including by the act's primary Senate sponsor.[4] For example, in the Owasso Independent School District v. Falvo case, an important part of the debate was determining the relationship between peer-grading and "education records" as defined in FERPA. In the Court of Appeals, it was ruled that students placing grades on the work of other students made such work into an "education record." Thus, peer-grading was determined as a violation of FERPA privacy policies because students had access to other students' academic performance without full consent.[5] However, when the case went to the Supreme Court, it was officially ruled that peer-grading was not a violation of FERPA. This is because a grade written on a student's work does not become an "education record" until the teacher writes the final grade into a grade book.[6]
## 
## Student medical records[edit]
## Legal experts have debated the issue of whether student medical records (for example records of therapy sessions with a therapist at an on-campus counseling center) might be released to the school administration under certain triggering events, such as when a student sued his college or university.[7][8]
## 
## Usually, student medical treatment records will remain under the protection of FERPA, not the Health Insurance Portability and Accountability Act (HIPAA). This is due to the "FERPA Exception" written within HIPAA.[9]
re.findall("[a-zA-Z]{1,100}\[edit\]", wiki)
## ['Overview[edit]', 'records[edit]', 'records[edit]']
re.findall("[\w]{1,100}\[edit\]", wiki)
## ['Overview[edit]', 'records[edit]', 'records[edit]']
re.findall("[\w]+\[edit\]", wiki)
## ['Overview[edit]', 'records[edit]', 'records[edit]']
re.findall("[\w ]*\[edit\]", wiki)
## ['Overview[edit]', 'Access to public records[edit]', 'Student medical records[edit]']
for title in re.findall("[\w ]*\[edit\]", wiki):
    print(re.split("[\[]", title)[0])
## Overview
## Access to public records
## Student medical records

Group

re.findall("([\w ]*)(\[edit\])", wiki)
## [('Overview', '[edit]'), ('Access to public records', '[edit]'), ('Student medical records', '[edit]')]
for title in re.findall("([\w ]*)(\[edit\])", wiki):
    print(title[0])
## Overview
## Access to public records
## Student medical records

iterator

mytuple = ("apple", "banana", "cherry")
myit = iter(mytuple)

print(next(myit))
## apple
print(next(myit))
## banana
print(next(myit))
## cherry
mytuple = ("apple", "banana", "cherry")
myit = iter(mytuple)

for i in myit:
  print(i)
## apple
## banana
## cherry

Create an iterator for RE

for item in re.finditer("([\w ]*)(\[edit\])", wiki):
    print(item.groups())
## ('Overview', '[edit]')
## ('Access to public records', '[edit]')
## ('Student medical records', '[edit]')
item
## <re.Match object; span=(3692, 3721), match='Student medical records[edit]'>
item.span()
## (3692, 3721)
item.group()
## 'Student medical records[edit]'
item.group(0)
## 'Student medical records[edit]'
item.group(1)
## 'Student medical records'
item.group(2)
## '[edit]'

Add title to groups

for item in re.finditer("(?P<title>[\w ]*)(?P<edit_link>\[edit\])", wiki):
    print(item.groupdict()['title'])
## Overview
## Access to public records
## Student medical records
item.groupdict()
## {'title': 'Student medical records', 'edit_link': '[edit]'}

Look-ahead and look-behind

text = "Amy works diligently. Alex gets good grades. Our student Aaron is successful"
re.findall("(\w+ )(?=works)", text)
## ['Amy ']
m = re.search('(?<=abc)def', 'abcdef')
m.group()
## 'def'
m = re.search('(?=abc)def', 'abcdef')
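m ## None: the look-ahead (?=abc) requires "abc" at the same position where "def" must match, so nothing matches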

Example, Twitter data

with open("nytimeshealth.txt", "r") as file:
    health=file.read()
print(health[:600])
## 548662191340421120|Sat Dec 27 02:10:34 +0000 2014|Risks in Using Social Media to Spot Signs of Mental Distress http://nyti.ms/1rqi9I1
## 548579831169163265|Fri Dec 26 20:43:18 +0000 2014|RT @paula_span: The most effective nationwide diabetes prevention program you've probably never heard of:  http://newoldage.blogs.nytimes.com/2014/12/26/diabetes-prevention-that-works/
## 548579045269852161|Fri Dec 26 20:40:11 +0000 2014|The New Old Age Blog: Diabetes Prevention That Works http://nyti.ms/1xm7fTi
## 548444679529041920|Fri Dec 26 11:46:15 +0000 2014|Well: Comfort Casseroles for Winter Dinners http://nyti
re.findall("@\w+", health)
## ['@paula_span', '@nytimes', '@cslnyt', '@CDCgov', '@nytDeniseGrady', '@nytDeniseGrady', '@celiadugger', '@sangerkatz', '@stavernise', '@KJDellAntonia', '@NYTMotherlode', '@BethBDaley', '@bencareynyt', '@bencareynyt', '@cslnyt', '@susandominus', '@paula_span', '@bstrauch', '@bencareynyt', '@UpshotNYT', '@deborahblum', '@cslnyt', '@acognews', '@Preeclampsia', '@MySMFM', '@randyNYT', '@paula_span', '@UpshotNYT', '@nycscribe', '@GardinerHarris', '@nytimesbooks', '@nytimesbooks', '@UpshotNYT', '@jswatz', '@AlexVadukul', '@michaelpollan', '@celiadugger', '@paula_span', '@bstrauch', '@ginakolata', '@cslnyt', '@stavernise', '@deborahblum', '@nytimes', '@nytimeswell', '@bstrauch', '@ginakolata', '@UpshotNYT', '@sangerkatz', '@palafo', '@nytimes', '@UpshotNYT', '@paula_span', '@bstrauch', '@bstrauch', '@GardinerHarris', '@cslnyt', '@cslnyt', '@UpshotNYT', '@paula_span', '@janegross', '@sangerkatz', '@amand', '@abbygoodnough', '@abbygoodnough', '@SominiSengupta', '@nycscribe', '@celiadugger', '@pfizer', '@gatesfoundation', '@usaid', '@EllenBarryNYT', '@paula_span', '@paula_span', '@camaldarelli', '@paula_span', '@katiehafner', '@AARP', '@camaldarelli', '@paula_span', '@ginakolata', '@HealthStudent', '@grynbaum', '@nytimeswell', '@paula_span', '@bstrauch', '@mattfleg', '@NYTMetro', '@BilldeBlasio', '@nytimeswell', '@nytimeswell', '@nytimeswell', '@NYTMetro', '@nytimes', '@NYTMetro', '@NYGovCuomo', '@GovChristie', '@NYTMetro', '@BilldeBlasio', '@joshhaner', '@NYTMetro', '@RamRajuMD', '@nytgraphics', '@NYTMetro', '@mattfleg', '@mattfleg', '@NYTMetro', '@DrMaryTBassett', '@peterbakernyt', '@nytDeniseGrady', '@grynbaum', '@FrancesRobles', '@DrMaryTBassett', '@carlzimmer', '@BilldeBlasio', '@grynbaum', '@BilldeBlasio', '@deBlasioNYC', '@deBlasioNYC', '@mattfleg', '@mattfleg', '@NYTMetro', '@SarahMaslinNir', '@deBlasioNYC', '@mattfleg', '@FrancesRobles', '@carlzimmer', '@NYTMetro', '@heathertal', '@nytDeniseGrady', '@nytDeniseGrady', '@susannecraig', '@DrMaryTBassett', 
'@thomaskaplan', '@cslnyt', '@DrMaryTBassett', '@thomaskaplan', '@thomaskaplan', '@grynbaum', '@BilldeBlasio', '@archietse', '@nytDeniseGrady', '@BilldeBlasio', '@NYGovCuomo', '@jswatz', '@lpolgreen', '@jswatz', '@denisegrady', '@tminsberg', '@jswatz', '@carlzimmer', '@nytimes', '@HirokoTabuchi', '@cslnyt', '@nytimesscience', '@Stitcher', '@SeniorsLaw', '@paula_span', '@TheNewOldAge', '@UpshotNYT', '@vikasbajaj', '@helenecooper', '@nytvideo', '@HannaIngber', '@ssstrom', '@motokorich', '@ssstrom', '@sangerkatz', '@aaronecarroll', '@lpolgreen', '@MonkovicNYT', '@bstrauch', '@jimdao', '@helenecooper', '@picardonhealth', '@aaronecarroll', '@nytimes', '@sangerkatz', '@helenecooper', '@helenecooper', '@mannyNYT', '@nytimes', '@mannyNYT', '@paula_span', '@Atul_Gawande', '@camaldarelli', '@CDCgov', '@CDCgov', '@carlzimmer', '@palafo', '@nytimes', '@DavidFurstNYT', '@meslackman', '@nycscribe', '@nytpolitics', '@cslnyt', '@nytDeniseGrady', '@bcsolomon', '@nytimes', '@rickgladstone', '@meslackman', '@celiadugger', '@ReedAbelson', '@stavernise', '@mannyNYT', '@celiadugger', '@bcsolomon', '@BK4theINYT', '@ksacknyt', '@ksacknyt', '@ksacknyt', '@liamstack', '@HannaIngber', '@CDCFound', '@AndrewScrivani', '@AndrewScrivani', '@AndrewScrivani', '@DavidFurstNYT', '@berehulak', '@ksacknyt', '@Jeffdelviscio', '@jimdao', '@ksacknyt', '@katie_thomas', '@bstrauch', '@jimdao', '@lpolgreen', '@rcallimachi', '@sherifink', '@nickconfessore', '@jimdao', '@judith_graham', '@Jeffdelviscio', '@MayBrittMoser', '@jimgorman', '@sangerkatz', '@katie_thomas', '@ksacknyt', '@cslnyt', '@MarthaRShulman', '@bcsolomon', '@nytimesscience', '@MayBrittMoser', '@jimdao', '@NobelPrize', '@damiencave', '@helenecooper', '@bstrauch', '@ksacknyt', '@bcsolomon', '@bstrauch', '@NIAIDNews', '@jeromedelay', '@lpolgreen', '@paula_span', '@abbygoodnough', '@bstrauch', '@celiadugger', '@celiadugger', '@nytimeswell', '@sangerkatz', '@judith_graham', '@heathertal', '@nytDeniseGrady', '@PamBelluck', '@bstrauch', 
'@nycscribe', '@CDCgov', '@DrFriedenCDC', '@nytDeniseGrady', '@carlzimmer', '@DrFri', '@sangerkatz', '@roseperson', '@martinenserink', '@lpolgreen', '@bcsolomon', '@bstrauch', '@bcsolomon', '@MSF', '@bcsolomon', '@MSF', '@paula_span', '@carlzimmer', '@nytimeswell', '@dacorc', '@yoshaylaharris', '@dacorc', '@carlzimmer', '@JustinHGillis', '@NYTmag', '@Samuel_Aranda', '@berehulak', '@heathertal', '@berehulak', '@celiadugger', '@paula_span', '@kchangnyt', '@MarthaRShulman', '@sangerkatz', '@nytimes', '@ReedAbelson', '@celiadugger', '@cslnyt', '@natashanyt', '@cslnyt', '@kchangnyt', '@tminsberg', '@PamBelluck', '@celiadugger', '@bcsolomon', '@MSF', '@bcsolomon', '@MSF', '@nytvideo', '@sangerkatz', '@JeffGordinier', '@GretchenReynold', '@nytimeswell', '@bstrauch', '@nytopinion', '@paula_span', '@judith_graham', '@bcsolomon', '@nytimes', '@lpolgreen', '@bcsolomon', '@MarthaRShulman', '@Sulliview', '@gatesfoundation', '@michaelluo', '@bstrauch', '@MarthaRShulman', '@cslnyt', '@anemonanyc', '@j', '@ReedAbelson', '@UpshotNYT', '@bcsolomon', '@MSF', '@cslnyt', '@bencareynyt', '@bencareynyt', '@paula_span', '@ginakolata', '@nytimeswell', '@carlzimmer', '@heathertal', '@anahadoconnor', '@anahadoconnor', '@nycscribe', '@carlzimmer', '@cslnyt', '@FordhamLawNYC', '@nytDeniseGrady', '@DanBilefsky', '@cslnyt', '@iglovett', '@ReedAbelson', '@bstrauch', '@jimdao', '@anahadoconnor', '@cslnyt', '@bstrauch', '@cslnyt', '@AmerAcadPeds', '@cslnyt', '@jimdao', '@paula_span', '@PamBelluck', '@nytopinion', '@heathertal', '@bstrauch', '@nytDeniseGrady', '@carlzimmer', '@nytimes', '@Jeffdelviscio', '@jimdao', '@nytopinion', '@dacorc', '@nytDeniseGrady', '@ABC', '@nytDeniseGrady', '@WSJ', '@nytDeniseGrady', '@cslnyt', '@katie_thomas', '@nytDeniseGrady', '@nytDeniseGrady', '@sangerkatz', '@aaronecarroll', '@bstrauch', '@nytrosenthal', '@MattWaldNYT', '@bstrauch', '@bstrauch', '@paula_span', '@WilliamJBroad', '@UpshotNYT', '@cslnyt', '@sangerkatz', '@DrPaulOffit', '@cslnyt', '@kchangnyt', 
'@sangerkatz', '@ProfEmilyOster', '@celiadugger', '@cslnyt', '@nytDeniseGrady', '@emoryhealthcare', '@_nicolaclark', '@rcallimachi', '@UpshotNYT', '@celiadugger', '@nytDeniseGrady', '@tminsberg', '@nytopinion', '@paula_span', '@celiadugger', '@cslnyt', '@ksacknyt', '@nytimes', '@sangerkatz', '@ReedAbelson', '@EricLichtblau', '@ksacknyt', '@nytimes', '@jimgorman', '@cslnyt', '@RebeccaRotert', '@paula_span', '@NYTmag', '@celiadugger', '@sangerkatz', '@CDCgov', '@paula_span', '@celiadugger', '@nytopinion', '@celiadugger', '@bcsolomon', '@reddit', '@nytDeniseGrady', '@cslnyt', '@nytDeniseGrady', '@celiadugger', '@nytDeniseGrady', '@ginakolata', '@sbkaufman', '@jonathanweisman', '@paula_span', '@PamBelluck', '@bcsolomon', '@jswatz', '@nytrosenthal', '@nytimeswell', '@rickgladstone', '@UNICEF', '@nytimesscience', '@carlzimmer', '@NYTNow', '@sangerkatz', '@UpshotNYT', '@celiadugger', '@Jeffdelviscio', '@nytDeniseGrady', '@sherifink', '@jimdao', '@lpolgreen', '@kerrimac', '@celiadugger', '@cslnyt', '@nytDeniseGrady', '@Jeffdelviscio', '@nytimes', '@bcsolomon', '@lpolgreen', '@nytDeniseGrady', '@ReedAbelson', '@RachelAbramsNY', '@bcsolomon', '@bcsolomon', '@nytimesscience', '@fedira', '@WHO', '@jimdao', '@nytDeniseGrady', '@celiadugger', '@carlzimmer', '@nytimes', '@bcsolomon', '@heathertal', '@bcsolomon', '@nycscribe', '@celiadugger', '@nickconfessore', '@deanbaquet', '@davjolly', '@nytDeniseGrady', '@cslnyt', '@DrFriedenCDC', '@sangerkatz', '@jswatz', '@eckholm', '@celiadugger', '@jimdao', '@cslnyt', '@UpshotNYT', '@nytDeniseGrady', '@GretchenReynold', '@rickgladstone', '@stavernise', '@UpshotNYT', '@MonkovicNYT', '@ReedAbelson', '@UpshotNYT', '@sangerkatz', '@ReedAbelson', '@nytrosenthal', '@paula_span', '@PamBelluck', '@charlesornstein', '@celiadugger', '@nytDeniseGrady', '@nytDeniseGrady', '@MarthaRShulman', '@_Waterwhispers_', '@GentleWhisperin', '@nytDeniseGrady', '@nytDeniseGrady', '@MarthaRShulman', '@nytDeniseGrady', '@nytDeniseGrady', '@StateDept', '@cslnyt', 
'@anemonanyc', '@paula_span', '@cslnyt', '@anemonanyc', '@celiadugger', '@paula_span', '@judith_graham', '@celiadugger', '@deborahblum', '@theleechguy', '@nytimeshealth', '@celiadugger', '@blaineharden', '@deborahblum', '@paula_span', '@nytopinion', '@cslnyt', '@celiadugger', '@paula_span', '@NitaLowey', '@paula_span', '@MartinSmithEdit', '@damiencave', '@sherylstolberg', '@celiadugger', '@abbygoodnough', '@jtaylornyt', '@celiadugger', '@nytdenisegrady', '@GretchenReynold', '@MarthaRShulman', '@JanHoffmanNYT', '@MarthaRShulman', '@nytimesscience', '@palafo', '@cslnyt', '@alanschwarz', '@jtaylornyt', '@sargoll', '@liamstack', '@Jeffdelviscio', '@croakeyblog', '@jo_m_chandler', '@DoctorYasmin', '@DoctorYasmin', '@heathertal', '@paula_span', '@deborahblum', '@nytimeshealth', '@MarthaRShulman', '@UpshotNYT', '@celiadugger', '@David_Dobbs', '@albertsun', '@paula_span', '@nytimeswell', '@cslnyt', '@greenhousenyt', '@bydanielvictor', '@cslnyt', '@GardinerHarris', '@markoff', '@nytimes', '@GardinerHarris', '@lpolgreen', '@gardinerharris', '@lpolgreen', '@paula_span', '@celiadugger', '@cslnyt', '@Suzannedaley', '@deborahblum', '@suleikajaouad', '@nytimes', '@celiadugger', '@nytopinion', '@paula_span', '@nytopinion', '@celiadugger', '@dacorc', '@JanHoffmanNYT', '@nytdining', '@Bittman', '@esung', '@GabrielleGlaser', '@MarthaRShulman', '@tamarnyt', '@celiadugger', '@nytrosenthal', '@nytrosenthal', '@nytrosenthal', '@cslnyt', '@gabrielleglaser', '@paula_span', '@heathertal', '@nytrosenthal', '@nytrosenthal', '@nytrosenthal', '@celiadugger', '@ReedAbelson', '@BarryMeier', '@PamBelluck', '@cslnyt', '@aaronecarroll', '@paula_span', '@nytimes', '@nytDeniseGrady', '@deborahblum', '@nytimes', '@nytvideo', '@cslnyt', '@adamliptak', '@dacorc', '@Angier58', '@aaroncarroll', '@UpshotNYT', '@jessbidgood', '@PPact', '@celiadugger', '@PamBelluck', '@eckholm', '@WilliamJBroad', '@jswatz', '@cslnyt', '@David_Dobbs', '@nytimes', '@NYTmag', '@anahadoconnor', '@cslnyt', '@michaelroston', 
'@WilliamJBroad', '@PamBelluck', '@PamBelluck', '@wnyc', '@LeonardLopate', '@WilliamJBroad', '@paula_span', '@anahadoconnor', '@JanHoffmanNYT', '@grynbaum', '@JulietMacur', '@celiadugger', '@PatcohenNYT', '@hellofromcath', '@paula_span', '@cslnyt', '@TheNewOldAge', '@ginakolata', '@WilliamJBroad', '@cslnyt', '@cslnyt', '@jessemckinley', '@PamBelluck', '@celiadugger', '@celiadugger', '@JessicaZitter', '@halbfinger', '@jessemckinley', '@albertsun', '@mollywood', '@celiadugger', '@ronicaryn', '@anahadoconnor', '@paula_span', '@PamBelluck', '@NYTMotherlode', '@celiadugger', '@SophieEganNYT', '@nytimeswell', '@anahadoconnor', '@SophieEganNYT', '@PamBelluck', '@cslnyt', '@celiadugger', '@realjohngreen', '@ReedAbelson', '@paula_span', '@paula_span', '@celiadugger', '@cslnyt', '@anahadoconnor', '@cslnyt', '@paula_span', '@celiadugger', '@ReedAbelson', '@nytDeniseGrady', '@paula_span', '@janegross', '@nytDeniseGrady', '@paula_span', '@nytDeniseGrady', '@celiadugger', '@nytimes', '@celiadugger', '@celiadugger', '@cslnyt', '@ReedAbelson', '@celiadugger', '@paula_span', '@PerriKlass', '@ReedAbelson', '@celiadugger', '@paula_span', '@DoctorpaulMD', '@nytDeniseGrady', '@cslnyt', '@jimgorman', '@paula_span', '@celiadugger', '@KarenBBarrow', '@nytimes', '@celiadugger', '@bencareynyt', '@celiadugger', '@cslnyt', '@nytdenisegra', '@cslnyt', '@paula_span', '@celiadugger', '@cslnyt', '@NIH', '@ronicaryn', '@celiadugger', '@anahadoconnor', '@cslnyt', '@anahadoconnor', '@paula_span', '@celiadugger', '@celiadugger', '@nytDeniseGrady', '@celiadugger', '@cslnyt', '@UMNews', '@GretchenReynold', '@cslnyt', '@GretchenReynold', '@albertsun', '@paula_span', '@cslnyt', '@ronicaryn', '@cslnyt', '@stavernise', '@celiadugger', '@cslnyt', '@celiadugger', '@cslnyt', '@WHO', '@cslnyt', '@paula_span', '@amy_harmon', '@cslnyt', '@RenskevanWijk', '@celiadugger', '@taraparkerpope', '@anahadoconnor', '@celiadugger', '@celiadugger', '@cslnyt', '@celiadugger', '@cslnyt', '@nytrosenthal', '@celiadugger', 
'@paula_span', '@cslnyt', '@celiadugger', '@celiadugger', '@cslnyt', '@taraparkerpope', '@paula_span', '@cslnyt', '@nytd', '@paula_span', '@nytDeniseGrady', '@nytDeniseGrady', '@cslnyt', '@TheLancet', '@cslnyt', '@cslnyt', '@bencareynyt', '@paula_span', '@paula_span', '@paula_span', '@paula_span', '@cslnyt', '@GretchenReynold', '@cslnyt', '@sarahcnyt', '@cslnyt', '@celiadugger', '@paula_span', '@NIH', '@nytDeniseGrady', '@cslnyt', '@cslnyt', '@cslnyt', '@stavernise', '@paula_span', '@cslnyt', '@ronicaryn', '@UPenn', '@Princeton', '@ronicaryn', '@nytimes', '@kesselheim', '@celiadugger', '@cslnyt', '@AmerAcadPeds', '@TobaccoFreeKids', '@cslnyt', '@abbyellin', '@paula_span', '@paula_span', '@cslnyt', '@stavernise', '@anahadoconnor', '@paula_span', '@cslnyt', '@paula_span', '@susanjaffe', '@cslnyt', '@cslnyt', '@nytDeniseGrady', '@celiadugger', '@cslnyt', '@cslnyt', '@mrichtel', '@paula_span', '@paula_span', '@celiadugger', '@cslnyt', '@ronicaryn', '@cslnyt', '@PamBelluck', '@GretchenReynold', '@ronicaryn', '@celiadugger', '@paula_span', '@cslnyt', '@JanHoffmanNYT', '@StevenPetrow', '@paula_span', '@paulinechen', '@nytimeswell', '@SophieEganNYT', '@nytimeswell', '@GretchenReynold', '@albertsun', '@paula_span', '@paula_span', '@cslnyt', '@amy_harmon', '@KassieBracken', '@KassieBracken', '@amy_harmon', '@cslnyt', '@GretchenReynold', '@albertsun', '@cslnyt', '@anahadoconnor', '@celiadugger', '@celiadugger', '@paula_span', '@celiadugger', '@celiadugger', '@paula_span', '@cslnyt', '@cslnyt', '@ginakolata', '@cslnyt', '@cslnyt', '@paula_span', '@celiadugger', '@celiadugger', '@celiadugger', '@nytDeniseGrady', '@celiadugger', '@paula_span', '@cslnyt', '@stavernise', '@celiadugger', '@celiadugger', '@paula_span', '@celiadugger', '@cslnyt', '@nejm', '@cslnyt', '@paula_span', '@cslnyt', '@Jeffdelviscio', '@nytDeniseGrady', '@nytDeniseGrady', '@cslnyt', '@ASPCA', '@cslnyt', '@jshenkin', '@ADANews', '@celiadugger', '@celiadugger', '@cslnyt', '@ADANews', '@celiadugger', 
'@nytDeniseGrady', '@nytDeniseGrady', '@paula_span', '@nytDeniseGrady', '@paula_span', '@pewresearch', '@nytDeniseGrady', '@celiadugger', '@celiadugger', '@nytDeniseGrady', '@ginakolata', '@paula_span', '@celiadugger', '@nytDeniseGrady', '@paula_span', '@nytimes', '@cslnyt', '@celiadugger', '@celiadugger', '@nytDeniseGrady', '@cslnyt', '@bencareynyt', '@nytDeniseGrady', '@PamBelluck', '@cslnyt', '@anahadoconnor', '@celiadugger', '@celiadugger', '@cslnyt', '@newoldage', '@cslnyt', '@cslnyt', '@nytdining', '@paula_span', '@paula_span', '@PamBelluck', '@celiadugger', '@celiadugger', '@PerriKlass', '@nytDeniseGrady', '@paula_span', '@judith_graham', '@anahadoconnor', '@anahadoconnor', '@taraparkerpope', '@celiadugger', '@cslnyt', '@taraparkerpo', '@ginakolata', '@cslnyt', '@anahadoconnor', '@celiadugger', '@paula_span', '@paula_span', '@cslnyt', '@celiadugger', '@celiadugger', '@anahadoconnor', '@cslnyt', '@stavernise', '@anahadoconnor', '@cslnyt', '@cslnyt', '@nytdenisegrady', '@cslnyt', '@car', '@celiadugger', '@Jeffdelviscio', '@carlzimmer', '@celiadugger', '@JaneBrody', '@celiadugger', '@cslnyt', '@tirosenberg', '@GretchenReynold', '@celiadugger', '@cslnyt', '@nytimeswell', '@celiadugger', '@cslnyt', '@ronicaryn', '@celiadugger', '@celiadugger', '@PamBelluck', '@cslnyt', '@nytrosenthal', '@paula_span', '@ginakolata', '@DQuenqua', '@celiadugger', '@paula_span', '@nytimeswell', '@celiadugger', '@ronicaryn', '@celiadugger', '@cslnyt', '@paula_span', '@cslnyt', '@paula_span', '@celiadugger', '@celiadugger', '@celiadugger', '@cslnyt', '@PamBelluck', '@anahadoconnor', '@cslnyt', '@bencareynyt', '@cslnyt', '@celiadugger', '@judith_graham', '@paula_span', '@celiadugger', '@celiadugger', '@paula_span', '@judith_graham', '@cslnyt', '@celiadugger', '@paula_span', '@PamBelluck', '@paula_span', '@paula_span', '@cslnyt', '@celiadugger', '@celiadugger', '@cslnyt', '@cslnyt', '@PamBelluck', '@celiadugger', '@paula_span', '@janegross', '@nytDeniseGrady', '@nytDeniseGrady', 
'@cslnyt', '@cslnyt', '@abbygoodnough', '@cslnyt', '@celiadugger', '@celiadugger', '@celiadugger', '@paula_span', '@celiadugger', '@paula_span', '@stavernise', '@nytimes', '@verizon', '@nytimeswell', '@verizon', '@celiadugger', '@celiadugger', '@paula_span', '@verizon', '@celiadugger', '@BarryMeier', '@celiadugger', '@ginakolata', '@nytimes', '@dacorc', '@nytDeniseGrady', '@nytimes', '@dacorc', '@celiadugger', '@cslnyt', '@ginakolata', '@cslnyt', '@KJDellAntonia', '@celiadugger', '@cslnyt', '@abbygoodnough', '@paula_span', '@nytimeswell', '@cslnyt', '@nytDeniseGrady', '@GretchenReynold', '@celiadugger', '@paula_span', '@magiorNYT', '@nytdenisegrady', '@celiadugger', '@nytimes', '@suleikajaouad', '@ginakolata', '@cslnyt', '@ginakolata', '@paula_span', '@paula_span', '@paula_span', '@susanjaffe', '@paula_span', '@paula_span', '@cslnyt', '@cslnyt', '@celiadugger', '@suleikajaouad', '@PamBelluck', '@PamBelluck', '@nytimeswell', '@nytimeswell', '@nytimeswell', '@newsemmys', '@ApothecaCanada', '@nytimeshealth', '@bstrauch', '@SophieEganNYT', '@cslnyt', '@cslnyt', '@paula_span', '@cslnyt', '@nytrosenthal', '@paula_span', '@PamBelluck', '@paula_span', '@PerriKlass', '@taraparkerpope', '@PerriKlass', '@cslnyt', '@bstrauch', '@GretchenReynold', '@Got_Next', '@nytimes', '@nyvinnie', '@nytimes', '@bstrauch', '@nytimes', '@brewcitypaul', '@nytimeswell', '@KarenBBarrow', '@nytimeswell', '@paula_span', '@bstrauch', '@nytimeswell', '@cslnyt', '@AAEnews', '@ADANews', '@Smile4Health', '@nytimeswell', '@cslnyt', '@nytimeswell', '@bstrauch', '@carlzimmer', '@paula_span', '@bstrauch', '@gardinerharris', '@bstrauch', '@cslnyt', '@bstrauch', '@bstrauch', '@gretchenreynolds', '@suleikajaouad', '@cslnyt', '@GretchenReynold', '@paula_span', '@cslnyt', '@ADHADOTORG', '@theNCI', '@UTHealth', '@PerriKlass', '@paula_span', '@PerriKlass', '@cslnyt', '@ProfEmilyOster', '@nytimesscience', '@cslnyt', '@anahadoconnor', '@bstrauch', '@bstrauch', '@henryfountain', '@cslnyt', '@patinamiller', 
'@PippinMusical', '@deborahblum', '@patinamiller', '@mfisherfitness', '@paula_span', '@danielleofri', '@paula_span', '@cslnyt', '@paula_span', '@SophieEganNYT', '@bstrauch', '@bstrauch', '@joyce_wadler', '@cslnyt', '@anahadoconnor', '@Dermdoc', '@nytimeshealth', '@paula_span', '@bstrauch', '@cslnyt', '@bstrauch', '@jmgorman', '@cslnyt', '@stavernise', '@JaneBrody', '@cslnyt', '@paula_span', '@paula_span', '@cslnyt', '@cslnyt', '@SophieEganNYT', '@cslnyt', '@StevenPetrow', '@nytimeswell', '@cslnyt', '@taraparkerpope', '@celiadugger', '@stavernise', '@cslnyt', '@HannaIngber', '@nytimeshealth', '@cslnyt', '@PamelaPaulNYT', '@KarenBBarrow', '@cslnyt', '@taraparker', '@paula_span', '@cslnyt', '@celiadugger', '@celiadugger', '@danielleofri', '@SophieEganNYT', '@stavernise', '@RoniCaryn', '@ronicaryn', '@paula_span', '@PamBelluck', '@GretchenReynold', '@RoniCaryn', '@paula_span', '@ronicaryn', '@anahadoconnor', '@ginakolata', '@paula_span', '@SophieEganNYT', '@emilymbadger', '@MarthaRShulman', '@JaneBrody', '@nytimeswell', '@ginakolata', '@celiadugger', '@DiscoverMag', '@paula_span', '@judith_graham', '@bstrauch', '@ginakolata', '@ginakolata', '@bstrauch', '@ginakolata', '@paula_span', '@JaneBrody', '@cslnyt', '@anahadoconnor', '@cslnyt', '@ShirleySWangWSJ', '@stavernise', '@cslnyt', '@nytimeswell', '@nytimeswell', '@JaneBrody', '@cslnyt', '@BarryMeier', '@taraparkerpope', '@suleikajaouad', '@NatGeo', '@bstrauch', '@GretchenReynolds', '@Pogue', '@JaneBrody', '@DQuenqua', '@PamBelluck', '@PamBelluck', '@celiadugger', '@MarthaRShulman', '@cslnyt', '@TorontoStar', '@nytimesscience', '@andrewscrivani', '@WSJ', '@WSJ', '@KomenfortheCure', '@bstrauch', '@celiadugger', '@cslnyt', '@ginakolata', '@dawnlerman', '@GretchenReynold', '@BarryMeier', '@ginakolata', '@PerriKlass', '@cslnyt', '@celiadugger', '@ginakolata', '@nytimeswell', '@nytimeswell', '@nytimeswell', '@nyti', '@ginakolata', '@KarenBBarrow', '@nytimeswell', '@BarryMeier', '@nytimeswell', '@RobertSturman', 
'@celiadugger', '@suleikajaouad', '@yoshaylaharris', '@celiadugger', '@cslnyt', '@JanHoffmanNYT', '@todayshow', '@cslnyt', '@brianfidelman', '@beccanalia', '@celiadugger', '@GretchenReynold', '@celiadugger', '@ronicaryn', '@cslnyt', '@AADskin', '@KarenBBarrow', '@anahadoconnor', '@suleikajaouad', '@byJenAMiller', '@nytDeniseGrady', '@taraparkerpope', '@taraparkerpope', '@zabmaria', '@amy_harmon', '@MarthaRShulman', '@cslnyt', '@cslnyt', '@nytimeswell', '@GretchenReynold', '@GretchenReynold', '@brianfidelman', '@NYBG', '@PamBelluck', '@UNCDentistry', '@nytimeshealth', '@paulinechen', '@cslnyt', '@cslnyt', '@taraparkerpope', '@KarenBBarrow', '@suleikajaouad', '@nytimeswell', '@GretchenReynold', '@PamBelluck', '@nytimesscience', '@cslnyt', '@henryfountain', '@GretchenReynold', '@MikkaelSekeres', '@GretchenReynold', '@ginakolata', '@GretchenReynold', '@GretchenReynold', '@nytimesscience', '@nytimes', '@KarenBBarrow', '@taraparkerpope', '@suleikajaouad', '@suleikajaouad', '@taraparkerpope', '@PaulineChen', '@GretchenReynold', '@DiscoverMag', '@nytDeniseGrady', '@nytimes', '@stavernise', '@GretchenReynold', '@nytimes', '@GretchenReynold', '@perriklass', '@PamBelluck', '@PamBelluck', '@cslnyt', '@PamBelluck', '@NY4P', '@highlinenyc', '@nytimeswell', '@brianfidelman', '@GretchenReynold', '@dacorc', '@mary_roach', '@mary_roach', '@TheDailyShow', '@nytimesvideo']
# multiple experiments
# (patterns are raw strings, r"...", so that \w, \d and \| reach the regex
#  engine unchanged and Python does not warn about invalid string escapes)
# re.findall(r"(\w+ \w+ \d+)(.*@\w+)", health)
# re.findall(r"(\w+ \w+ \d+)(?=.*@\w+)", health)
# re.findall(r"(\w{3} \w{3} \d+)(?=.*@\w+)", health)
## final version: capture "3 word chars, space, 3 word chars, space, digits"
## only when it is immediately preceded by "|" (lookbehind) and followed,
## before the next newline, by an "@" plus word characters (lookahead)
re.findall(r"(?<=\|)(\w{3} \w{3} \d+)(?=.*@\w+)", health)
## ['Fri Dec 26', 'Wed Dec 24', 'Wed Dec 24', 'Tue Dec 23', 'Mon Dec 22', 'Tue Dec 16', 'Mon Dec 15', 'Thu Dec 11', 'Thu Dec 11', 'Thu Dec 11', 'Thu Dec 11', 'Wed Dec 10', 'Wed Dec 10', 'Tue Dec 09', 'Tue Dec 09', 'Tue Dec 09', 'Mon Dec 08', 'Thu Dec 04', 'Thu Dec 04', 'Thu Dec 04', 'Tue Dec 02', 'Tue Dec 02', 'Mon Dec 01', 'Mon Dec 01', 'Mon Dec 01', 'Fri Nov 28', 'Wed Nov 26', 'Wed Nov 26', 'Wed Nov 26', 'Wed Nov 26', 'Tue Nov 25', 'Mon Nov 24', 'Mon Nov 24', 'Mon Nov 24', 'Fri Nov 21', 'Fri Nov 21', 'Thu Nov 20', 'Thu Nov 20', 'Thu Nov 20', 'Wed Nov 19', 'Wed Nov 19', 'Tue Nov 18', 'Tue Nov 18', 'Tue Nov 18', 'Mon Nov 17', 'Mon Nov 17', 'Mon Nov 17', 'Mon Nov 17', 'Mon Nov 17', 'Fri Nov 14', 'Fri Nov 14', 'Tue Nov 11', 'Fri Nov 07', 'Fri Nov 07', 'Thu Nov 06', 'Tue Nov 04', 'Fri Oct 31', 'Thu Oct 30', 'Thu Oct 30', 'Wed Oct 29', 'Mon Oct 27', 'Mon Oct 27', 'Mon Oct 27', 'Mon Oct 27', 'Mon Oct 27', 'Mon Oct 27', 'Mon Oct 27', 'Sun Oct 26', 'Sat Oct 25', 'Fri Oct 24', 'Fri Oct 24', 'Fri Oct 24', 'Fri Oct 24', 'Fri Oct 24', 'Fri Oct 24', 'Fri Oct 24', 'Fri Oct 24', 'Fri Oct 24', 'Fri Oct 24', 'Fri Oct 24', 'Fri Oct 24', 'Fri Oct 24', 'Fri Oct 24', 'Fri Oct 24', 'Fri Oct 24', 'Fri Oct 24', 'Fri Oct 24', 'Fri Oct 24', 'Fri Oct 24', 'Fri Oct 24', 'Fri Oct 24', 'Fri Oct 24', 'Fri Oct 24', 'Fri Oct 24', 'Fri Oct 24', 'Fri Oct 24', 'Fri Oct 24', 'Fri Oct 24', 'Fri Oct 24', 'Fri Oct 24', 'Fri Oct 24', 'Fri Oct 24', 'Fri Oct 24', 'Fri Oct 24', 'Fri Oct 24', 'Fri Oct 24', 'Fri Oct 24', 'Fri Oct 24', 'Fri Oct 24', 'Fri Oct 24', 'Fri Oct 24', 'Fri Oct 24', 'Fri Oct 24', 'Thu Oct 23', 'Wed Oct 22', 'Wed Oct 22', 'Wed Oct 22', 'Wed Oct 22', 'Wed Oct 22', 'Wed Oct 22', 'Wed Oct 22', 'Tue Oct 21', 'Tue Oct 21', 'Tue Oct 21', 'Mon Oct 20', 'Mon Oct 20', 'Mon Oct 20', 'Mon Oct 20', 'Mon Oct 20', 'Mon Oct 20', 'Mon Oct 20', 'Mon Oct 20', 'Mon Oct 20', 'Mon Oct 20', 'Mon Oct 20', 'Sat Oct 18', 'Fri Oct 17', 'Fri Oct 17', 'Fri Oct 17', 'Fri Oct 17', 'Fri Oct 17', 'Fri Oct 17', 'Fri 
Oct 17', 'Fri Oct 17', 'Fri Oct 17', 'Thu Oct 16', 'Thu Oct 16', 'Thu Oct 16', 'Thu Oct 16', 'Thu Oct 16', 'Thu Oct 16', 'Thu Oct 16', 'Thu Oct 16', 'Thu Oct 16', 'Thu Oct 16', 'Wed Oct 15', 'Wed Oct 15', 'Wed Oct 15', 'Wed Oct 15', 'Wed Oct 15', 'Tue Oct 14', 'Tue Oct 14', 'Sun Oct 12', 'Sat Oct 11', 'Fri Oct 10', 'Wed Oct 08', 'Wed Oct 08', 'Wed Oct 08', 'Wed Oct 08', 'Wed Oct 08', 'Wed Oct 08', 'Wed Oct 08', 'Wed Oct 08', 'Wed Oct 08', 'Wed Oct 08', 'Tue Oct 07', 'Tue Oct 07', 'Tue Oct 07', 'Tue Oct 07', 'Mon Oct 06', 'Mon Oct 06', 'Mon Oct 06', 'Mon Oct 06', 'Mon Oct 06', 'Mon Oct 06', 'Mon Oct 06', 'Mon Oct 06', 'Mon Oct 06', 'Mon Oct 06', 'Fri Oct 03', 'Fri Oct 03', 'Fri Oct 03', 'Fri Oct 03', 'Fri Oct 03', 'Fri Oct 03', 'Fri Oct 03', 'Fri Oct 03', 'Fri Oct 03', 'Thu Oct 02', 'Thu Oct 02', 'Thu Oct 02', 'Thu Oct 02', 'Thu Oct 02', 'Wed Oct 01', 'Wed Oct 01', 'Wed Oct 01', 'Tue Sep 30', 'Tue Sep 30', 'Tue Sep 30', 'Tue Sep 30', 'Tue Sep 30', 'Tue Sep 30', 'Tue Sep 30', 'Sun Sep 28', 'Fri Sep 26', 'Fri Sep 26', 'Fri Sep 26', 'Thu Sep 25', 'Thu Sep 25', 'Thu Sep 25', 'Wed Sep 24', 'Tue Sep 23', 'Mon Sep 22', 'Mon Sep 22', 'Fri Sep 19', 'Fri Sep 19', 'Fri Sep 19', 'Fri Sep 19', 'Fri Sep 19', 'Fri Sep 19', 'Thu Sep 18', 'Thu Sep 18', 'Thu Sep 18', 'Wed Sep 17', 'Wed Sep 17', 'Wed Sep 17', 'Wed Sep 17', 'Wed Sep 17', 'Wed Sep 17', 'Wed Sep 17', 'Wed Sep 17', 'Tue Sep 16', 'Tue Sep 16', 'Tue Sep 16', 'Tue Sep 16', 'Tue Sep 16', 'Fri Sep 12', 'Thu Sep 11', 'Thu Sep 11', 'Thu Sep 11', 'Thu Sep 11', 'Thu Sep 11', 'Thu Sep 11', 'Wed Sep 10', 'Wed Sep 10', 'Wed Sep 10', 'Wed Sep 10', 'Wed Sep 10', 'Tue Sep 09', 'Tue Sep 09', 'Tue Sep 09', 'Tue Sep 09', 'Tue Sep 09', 'Tue Sep 09', 'Mon Sep 08', 'Fri Sep 05', 'Fri Sep 05', 'Fri Sep 05', 'Fri Sep 05', 'Thu Sep 04', 'Thu Sep 04', 'Thu Sep 04', 'Wed Sep 03', 'Wed Sep 03', 'Wed Sep 03', 'Wed Sep 03', 'Wed Sep 03', 'Tue Sep 02', 'Tue Sep 02', 'Tue Sep 02', 'Tue Sep 02', 'Tue Sep 02', 'Tue Sep 02', 'Mon Sep 01', 'Fri Aug 29', 
'Thu Aug 28', 'Thu Aug 28', 'Thu Aug 28', 'Thu Aug 28', 'Thu Aug 28', 'Thu Aug 28', 'Thu Aug 28', 'Wed Aug 27', 'Wed Aug 27', 'Wed Aug 27', 'Wed Aug 27', 'Tue Aug 26', 'Mon Aug 25', 'Mon Aug 25', 'Mon Aug 25', 'Sun Aug 24', 'Sat Aug 23', 'Sat Aug 23', 'Fri Aug 22', 'Fri Aug 22', 'Thu Aug 21', 'Thu Aug 21', 'Thu Aug 21', 'Thu Aug 21', 'Wed Aug 20', 'Tue Aug 19', 'Tue Aug 19', 'Tue Aug 19', 'Tue Aug 19', 'Mon Aug 18', 'Mon Aug 18', 'Mon Aug 18', 'Mon Aug 18', 'Mon Aug 18', 'Mon Aug 18', 'Sun Aug 17', 'Sat Aug 16', 'Sat Aug 16', 'Sat Aug 16', 'Fri Aug 15', 'Fri Aug 15', 'Fri Aug 15', 'Fri Aug 15', 'Fri Aug 15', 'Thu Aug 14', 'Thu Aug 14', 'Thu Aug 14', 'Thu Aug 14', 'Thu Aug 14', 'Thu Aug 14', 'Wed Aug 13', 'Wed Aug 13', 'Tue Aug 12', 'Tue Aug 12', 'Tue Aug 12', 'Tue Aug 12', 'Tue Aug 12', 'Tue Aug 12', 'Tue Aug 12', 'Tue Aug 12', 'Tue Aug 12', 'Tue Aug 12', 'Tue Aug 12', 'Tue Aug 12', 'Tue Aug 12', 'Mon Aug 11', 'Mon Aug 11', 'Mon Aug 11', 'Mon Aug 11', 'Mon Aug 11', 'Mon Aug 11', 'Mon Aug 11', 'Sun Aug 10', 'Sun Aug 10', 'Sun Aug 10', 'Sat Aug 09', 'Sat Aug 09', 'Sat Aug 09', 'Sat Aug 09', 'Fri Aug 08', 'Fri Aug 08', 'Fri Aug 08', 'Fri Aug 08', 'Fri Aug 08', 'Fri Aug 08', 'Fri Aug 08', 'Fri Aug 08', 'Fri Aug 08', 'Fri Aug 08', 'Fri Aug 08', 'Fri Aug 08', 'Fri Aug 08', 'Fri Aug 08', 'Fri Aug 08', 'Thu Aug 07', 'Thu Aug 07', 'Thu Aug 07', 'Thu Aug 07', 'Thu Aug 07', 'Thu Aug 07', 'Thu Aug 07', 'Thu Aug 07', 'Thu Aug 07', 'Thu Aug 07', 'Thu Aug 07', 'Thu Aug 07', 'Wed Aug 06', 'Wed Aug 06', 'Wed Aug 06', 'Wed Aug 06', 'Wed Aug 06', 'Tue Aug 05', 'Tue Aug 05', 'Tue Aug 05', 'Tue Aug 05', 'Mon Aug 04', 'Mon Aug 04', 'Mon Aug 04', 'Mon Aug 04', 'Mon Aug 04', 'Sun Aug 03', 'Sat Aug 02', 'Sat Aug 02', 'Sat Aug 02', 'Fri Aug 01', 'Fri Aug 01', 'Fri Aug 01', 'Fri Aug 01', 'Fri Aug 01', 'Fri Aug 01', 'Fri Aug 01', 'Thu Jul 31', 'Thu Jul 31', 'Thu Jul 31', 'Wed Jul 30', 'Tue Jul 29', 'Tue Jul 29', 'Tue Jul 29', 'Fri Jul 25', 'Fri Jul 25', 'Fri Jul 25', 'Fri Jul 25', 'Thu Jul 
24', 'Thu Jul 24', 'Thu Jul 24', 'Wed Jul 23', 'Tue Jul 22', 'Mon Jul 21', 'Mon Jul 21', 'Mon Jul 21', 'Mon Jul 21', 'Mon Jul 21', 'Fri Jul 18', 'Fri Jul 18', 'Fri Jul 18', 'Fri Jul 18', 'Fri Jul 18', 'Fri Jul 18', 'Fri Jul 18', 'Fri Jul 18', 'Thu Jul 17', 'Thu Jul 17', 'Thu Jul 17', 'Thu Jul 17', 'Thu Jul 17', 'Wed Jul 16', 'Wed Jul 16', 'Wed Jul 16', 'Wed Jul 16', 'Wed Jul 16', 'Mon Jul 14', 'Mon Jul 14', 'Mon Jul 14', 'Mon Jul 14', 'Mon Jul 14', 'Mon Jul 14', 'Sun Jul 13', 'Sun Jul 13', 'Sun Jul 13', 'Sun Jul 13', 'Sun Jul 13', 'Thu Jul 10', 'Thu Jul 10', 'Tue Jul 08', 'Tue Jul 08', 'Tue Jul 08', 'Tue Jul 08', 'Mon Jul 07', 'Mon Jul 07', 'Mon Jul 07', 'Mon Jul 07', 'Mon Jul 07', 'Mon Jul 07', 'Mon Jul 07', 'Mon Jul 07', 'Thu Jul 03', 'Thu Jul 03', 'Thu Jul 03', 'Thu Jul 03', 'Thu Jul 03', 'Thu Jul 03', 'Wed Jul 02', 'Wed Jul 02', 'Wed Jul 02', 'Wed Jul 02', 'Wed Jul 02', 'Tue Jul 01', 'Tue Jul 01', 'Mon Jun 30', 'Mon Jun 30', 'Mon Jun 30', 'Mon Jun 30', 'Mon Jun 30', 'Mon Jun 30', 'Mon Jun 30', 'Mon Jun 30', 'Mon Jun 30', 'Mon Jun 30', 'Mon Jun 30', 'Mon Jun 30', 'Fri Jun 27', 'Fri Jun 27', 'Fri Jun 27', 'Fri Jun 27', 'Fri Jun 27', 'Fri Jun 27', 'Thu Jun 26', 'Thu Jun 26', 'Thu Jun 26', 'Thu Jun 26', 'Wed Jun 25', 'Tue Jun 24', 'Tue Jun 24', 'Tue Jun 24', 'Tue Jun 24', 'Tue Jun 24', 'Tue Jun 24', 'Mon Jun 23', 'Mon Jun 23', 'Fri Jun 20', 'Fri Jun 20', 'Fri Jun 20', 'Fri Jun 20', 'Fri Jun 20', 'Thu Jun 19', 'Thu Jun 19', 'Wed Jun 18', 'Wed Jun 18', 'Tue Jun 17', 'Tue Jun 17', 'Tue Jun 17', 'Tue Jun 17', 'Mon Jun 16', 'Mon Jun 16', 'Mon Jun 16', 'Mon Jun 16', 'Mon Jun 16', 'Fri Jun 13', 'Thu Jun 12', 'Thu Jun 12', 'Wed Jun 11', 'Wed Jun 11', 'Wed Jun 11', 'Wed Jun 11', 'Tue Jun 10', 'Tue Jun 10', 'Mon Jun 09', 'Fri Jun 06', 'Fri Jun 06', 'Tue Jun 03', 'Tue Jun 03', 'Tue Jun 03', 'Tue Jun 03', 'Tue Jun 03', 'Mon Jun 02', 'Fri May 30', 'Fri May 30', 'Thu May 29', 'Tue May 27', 'Tue May 27', 'Tue May 27', 'Fri May 23', 'Thu May 22', 'Thu May 22', 'Thu May 22', 'Wed 
May 21', 'Wed May 21', 'Tue May 20', 'Tue May 20', 'Tue May 20', 'Tue May 20', 'Mon May 19', 'Mon May 19', 'Fri May 16', 'Thu May 15', 'Thu May 15', 'Wed May 14', 'Tue May 13', 'Tue May 13', 'Mon May 12', 'Mon May 12', 'Fri May 09', 'Fri May 09', 'Wed May 07', 'Wed May 07', 'Wed May 07', 'Tue May 06', 'Tue May 06', 'Tue May 06', 'Tue May 06', 'Mon May 05', 'Mon May 05', 'Mon May 05', 'Mon May 05', 'Mon May 05', 'Sun May 04', 'Fri May 02', 'Thu May 01', 'Wed Apr 30', 'Wed Apr 30', 'Tue Apr 29', 'Tue Apr 29', 'Mon Apr 28', 'Mon Apr 28', 'Fri Apr 25', 'Thu Apr 24', 'Thu Apr 24', 'Wed Apr 23', 'Wed Apr 23', 'Wed Apr 23', 'Wed Apr 23', 'Fri Apr 18', 'Fri Apr 18', 'Fri Apr 18', 'Thu Apr 17', 'Thu Apr 17', 'Mon Apr 14', 'Mon Apr 14', 'Fri Apr 11', 'Thu Apr 10', 'Thu Apr 10', 'Thu Apr 10', 'Wed Apr 09', 'Wed Apr 09', 'Tue Apr 08', 'Tue Apr 08', 'Tue Apr 08', 'Tue Apr 08', 'Tue Apr 08', 'Fri Apr 04', 'Thu Apr 03', 'Thu Apr 03', 'Tue Apr 01', 'Tue Apr 01', 'Tue Apr 01', 'Tue Apr 01', 'Tue Apr 01', 'Mon Mar 31', 'Mon Mar 31', 'Mon Mar 31', 'Fri Mar 28', 'Thu Mar 27', 'Wed Mar 26', 'Wed Mar 26', 'Wed Mar 26', 'Wed Mar 26', 'Tue Mar 25', 'Tue Mar 25', 'Tue Mar 25', 'Mon Mar 24', 'Mon Mar 24', 'Mon Mar 24', 'Fri Mar 21', 'Thu Mar 20', 'Thu Mar 20', 'Wed Mar 19', 'Mon Mar 17', 'Mon Mar 17', 'Fri Mar 14', 'Fri Mar 14', 'Thu Mar 13', 'Thu Mar 13', 'Wed Mar 12', 'Wed Mar 12', 'Wed Mar 12', 'Wed Mar 12', 'Wed Mar 12', 'Wed Mar 12', 'Tue Mar 11', 'Tue Mar 11', 'Tue Mar 11', 'Tue Mar 11', 'Tue Mar 11', 'Tue Mar 11', 'Tue Mar 11', 'Tue Mar 11', 'Mon Mar 10', 'Thu Mar 06', 'Thu Mar 06', 'Tue Mar 04', 'Tue Mar 04', 'Mon Mar 03', 'Mon Mar 03', 'Mon Mar 03', 'Fri Feb 28', 'Thu Feb 27', 'Wed Feb 26', 'Wed Feb 26', 'Tue Feb 25', 'Tue Feb 25', 'Mon Feb 24', 'Mon Feb 24', 'Mon Feb 24', 'Mon Feb 24', 'Fri Feb 21', 'Fri Feb 21', 'Thu Feb 20', 'Thu Feb 20', 'Thu Feb 20', 'Wed Feb 19', 'Tue Feb 18', 'Mon Feb 17', 'Fri Feb 14', 'Thu Feb 13', 'Thu Feb 13', 'Wed Feb 12', 'Wed Feb 12', 'Tue Feb 11', 
'Mon Feb 10', 'Mon Feb 10', 'Fri Feb 07', 'Fri Feb 07', 'Thu Feb 06', 'Tue Feb 04', 'Tue Feb 04', 'Tue Feb 04', 'Tue Feb 04', 'Mon Feb 03', 'Mon Feb 03', 'Mon Feb 03', 'Mon Feb 03', 'Fri Jan 31', 'Tue Jan 14', 'Tue Jan 14', 'Tue Jan 14', 'Mon Jan 13', 'Mon Jan 13', 'Fri Jan 10', 'Fri Jan 10', 'Thu Jan 09', 'Thu Jan 09', 'Thu Jan 09', 'Thu Jan 09', 'Wed Jan 08', 'Wed Jan 08', 'Tue Jan 07', 'Tue Jan 07', 'Thu Jan 02', 'Thu Jan 02', 'Thu Dec 26', 'Tue Dec 24', 'Tue Dec 24', 'Mon Dec 23', 'Mon Dec 23', 'Mon Dec 23', 'Fri Dec 20', 'Fri Dec 20', 'Fri Dec 20', 'Thu Dec 19', 'Wed Dec 18', 'Wed Dec 18', 'Tue Dec 17', 'Mon Dec 16', 'Mon Dec 16', 'Fri Dec 13', 'Fri Dec 13', 'Thu Dec 12', 'Thu Dec 12', 'Wed Dec 11', 'Wed Dec 11', 'Wed Dec 11', 'Tue Dec 10', 'Tue Dec 10', 'Tue Dec 10', 'Mon Dec 09', 'Mon Dec 09', 'Mon Dec 09', 'Fri Dec 06', 'Thu Dec 05', 'Thu Dec 05', 'Thu Dec 05', 'Wed Dec 04', 'Wed Dec 04', 'Wed Dec 04', 'Tue Dec 03', 'Tue Dec 03', 'Tue Dec 03', 'Tue Dec 03', 'Tue Dec 03', 'Mon Dec 02', 'Mon Dec 02', 'Mon Dec 02', 'Mon Dec 02', 'Mon Dec 02', 'Fri Nov 29', 'Wed Nov 27', 'Wed Nov 27', 'Wed Nov 27', 'Wed Nov 27', 'Mon Nov 25', 'Thu Nov 21', 'Thu Nov 21', 'Thu Nov 21', 'Thu Nov 21', 'Thu Nov 21', 'Wed Nov 20', 'Tue Nov 19', 'Tue Nov 19', 'Tue Nov 19', 'Mon Nov 18', 'Fri Nov 15', 'Fri Nov 15', 'Thu Nov 14', 'Thu Nov 14', 'Thu Nov 14', 'Wed Nov 13', 'Tue Nov 12', 'Tue Nov 12', 'Mon Nov 11', 'Mon Nov 11', 'Mon Nov 11', 'Mon Nov 11', 'Thu Nov 07', 'Thu Nov 07', 'Wed Nov 06', 'Wed Nov 06', 'Tue Nov 05', 'Tue Nov 05', 'Tue Nov 05', 'Tue Nov 05', 'Mon Nov 04', 'Mon Nov 04', 'Fri Nov 01', 'Fri Nov 01', 'Thu Oct 31', 'Wed Oct 30', 'Tue Oct 29', 'Tue Oct 29', 'Tue Oct 29', 'Tue Oct 29', 'Mon Oct 28', 'Mon Oct 28', 'Mon Oct 28', 'Fri Oct 25', 'Fri Oct 25', 'Thu Oct 24', 'Thu Oct 24', 'Wed Oct 23', 'Tue Oct 22', 'Tue Oct 22', 'Tue Oct 22', 'Mon Oct 21', 'Mon Oct 21', 'Mon Oct 21', 'Fri Oct 18', 'Fri Oct 18', 'Thu Oct 17', 'Thu Oct 17', 'Wed Oct 16', 'Wed Oct 16', 'Wed Oct 
16', 'Wed Oct 16', 'Wed Oct 16', 'Wed Oct 16', 'Wed Oct 16', 'Tue Oct 15', 'Tue Oct 15', 'Tue Oct 15', 'Tue Oct 15', 'Mon Oct 14', 'Mon Oct 14', 'Sun Oct 13', 'Wed Oct 09', 'Wed Oct 09', 'Tue Oct 08', 'Tue Oct 08', 'Thu Oct 03', 'Thu Oct 03', 'Thu Oct 03', 'Wed Oct 02', 'Wed Oct 02', 'Wed Oct 02', 'Tue Oct 01', 'Tue Oct 01', 'Mon Sep 30', 'Mon Sep 30', 'Mon Sep 23', 'Mon Sep 23', 'Wed Sep 18', 'Wed Sep 18', 'Tue Sep 17', 'Tue Sep 17', 'Tue Sep 17', 'Wed Sep 11', 'Wed Sep 11', 'Wed Sep 11', 'Wed Sep 11', 'Tue Sep 10', 'Tue Sep 10', 'Tue Sep 10', 'Tue Sep 10', 'Wed Sep 04', 'Tue Sep 03', 'Tue Sep 03', 'Fri Aug 30', 'Fri Aug 30', 'Fri Aug 30', 'Fri Aug 30', 'Fri Aug 30', 'Thu Aug 29', 'Wed Aug 28', 'Wed Aug 28', 'Tue Aug 27', 'Mon Aug 26', 'Mon Aug 26', 'Mon Aug 26', 'Fri Aug 23', 'Fri Aug 23', 'Fri Aug 23', 'Wed Aug 21', 'Wed Aug 21', 'Wed Aug 21', 'Tue Aug 20', 'Tue Aug 20', 'Mon Aug 19', 'Mon Aug 19', 'Mon Aug 19', 'Mon Aug 19', 'Sat Aug 17', 'Fri Aug 16', 'Fri Aug 16', 'Fri Aug 16', 'Thu Aug 15', 'Tue Aug 13', 'Mon Aug 12', 'Fri Aug 09', 'Thu Aug 08', 'Thu Aug 08', 'Thu Aug 08', 'Wed Aug 07', 'Wed Aug 07', 'Wed Aug 07', 'Wed Aug 07', 'Wed Aug 07', 'Tue Aug 06', 'Tue Aug 06', 'Tue Aug 06', 'Mon Aug 05', 'Mon Aug 05', 'Mon Aug 05', 'Fri Aug 02', 'Fri Aug 02', 'Thu Aug 01', 'Thu Aug 01', 'Thu Aug 01', 'Thu Aug 01', 'Wed Jul 31', 'Tue Jul 30', 'Tue Jul 30', 'Fri Jul 26', 'Thu Jul 25', 'Thu Jul 25', 'Thu Jul 25', 'Wed Jul 24', 'Wed Jul 24', 'Tue Jul 23', 'Tue Jul 23', 'Mon Jul 22', 'Fri Jul 19', 'Thu Jul 18', 'Thu Jul 18', 'Wed Jul 17', 'Wed Jul 17', 'Wed Jul 17', 'Wed Jul 17', 'Wed Jul 17', 'Tue Jul 16', 'Tue Jul 16', 'Tue Jul 16', 'Tue Jul 16', 'Tue Jul 16', 'Mon Jul 15', 'Mon Jul 15', 'Mon Jul 15', 'Mon Jul 15', 'Fri Jul 12', 'Thu Jul 11', 'Thu Jul 11', 'Wed Jul 10', 'Wed Jul 10', 'Wed Jul 10', 'Wed Jul 10', 'Wed Jul 10', 'Tue Jul 09', 'Tue Jul 09', 'Tue Jul 09', 'Mon Jul 08', 'Mon Jul 08', 'Tue Jul 02', 'Tue Jul 02', 'Tue Jul 02', 'Tue Jul 02', 'Tue Jul 02', 'Tue 
Jul 02', 'Mon Jul 01', 'Fri Jun 28', 'Thu Jun 27', 'Thu Jun 27', 'Wed Jun 26', 'Wed Jun 26', 'Tue Jun 25', 'Tue Jun 25', 'Tue Jun 25', 'Tue Jun 25', 'Thu Jun 20', 'Wed Jun 19', 'Tue Jun 18', 'Tue Jun 18', 'Tue Jun 18', 'Tue Jun 18', 'Tue Jun 18', 'Tue Jun 18', 'Tue Jun 18', 'Tue Jun 18', 'Fri Jun 14', 'Fri Jun 14', 'Wed Jun 12', 'Wed Jun 12', 'Tue Jun 11', 'Tue Jun 11', 'Tue Jun 11', 'Thu Jun 06', 'Wed Jun 05', 'Wed Jun 05', 'Tue Jun 04', 'Tue Jun 04', 'Mon Jun 03', 'Mon Jun 03', 'Fri May 31', 'Fri May 31', 'Thu May 30', 'Thu May 30', 'Wed May 29', 'Wed May 29', 'Wed May 29', 'Tue May 28', 'Tue May 28', 'Wed May 22', 'Wed May 22', 'Tue May 21', 'Tue May 21', 'Tue May 21', 'Tue May 21', 'Fri May 17', 'Fri May 17', 'Tue May 14', 'Tue May 14', 'Tue May 14', 'Tue May 14', 'Tue May 14', 'Sat May 11', 'Fri May 10', 'Fri May 10', 'Fri May 10', 'Thu May 09', 'Wed May 08', 'Wed May 08', 'Tue May 07', 'Tue May 07', 'Tue May 07', 'Mon May 06', 'Mon May 06', 'Sat May 04', 'Thu May 02', 'Wed May 01', 'Wed May 01', 'Wed May 01', 'Wed May 01', 'Tue Apr 30', 'Fri Apr 26', 'Fri Apr 26', 'Thu Apr 25', 'Thu Apr 25', 'Wed Apr 24', 'Tue Apr 23', 'Tue Apr 23', 'Tue Apr 23', 'Mon Apr 22', 'Fri Apr 19', 'Thu Apr 18', 'Thu Apr 18', 'Wed Apr 17', 'Wed Apr 17', 'Tue Apr 16', 'Thu Apr 11', 'Wed Apr 10', 'Wed Apr 10', 'Mon Apr 08', 'Mon Apr 08', 'Fri Apr 05', 'Fri Apr 05', 'Thu Apr 04', 'Wed Apr 03', 'Wed Apr 03', 'Wed Apr 03', 'Wed Apr 03', 'Tue Apr 02', 'Tue Apr 02']

Nested Repetition Quantifiers

text_doi = "doi:10.1038/nphys1170, doi:10.1002/0470841559.ch1, DOI:10.1093/bib/bbab224"

## patterns are raw strings (r"...") so that backslash sequences such as \d
## are passed to the regex engine unchanged (no invalid-escape warning)
## alternation: match either spelling of the prefix
re.findall(r"DOI|doi", text_doi)
## ['doi', 'doi', 'DOI']
## character classes: each letter in either case, then ":" and one or more digits
re.findall(r"[Dd][Oo][Ii]:\d+", text_doi)
## ['doi:10', 'doi:10', 'DOI:10']
## non-capturing group (?:...): groups the alternation, while findall still
## returns the whole match
re.findall(r"(?:DOI|doi):\d+", text_doi)
## ['doi:10', 'doi:10', 'DOI:10']

How to match the entire DOI pattern?

text_doi = "doi:10.1038/nphys1170, doi:10.1002/0470841559.ch1, DOI:10.1093/bib/bbab224"

## patterns are raw strings (r"...") so that \d, \. and \w are passed to the
## regex engine unchanged (no invalid-escape warning)
## prefix and leading digits only
re.findall(r"(?:DOI|doi):\d+", text_doi)
## ['doi:10', 'doi:10', 'DOI:10']
## a single "/" followed by repeated "word chars + optional dot" pieces:
## the match stops at the second "/", so the last DOI is cut off at "bib"
re.findall(r"(?:DOI|doi):\d+\.\d+/(?:\w+\.?)+", text_doi)
## ['doi:10.1038/nphys1170', 'doi:10.1002/0470841559.ch1', 'DOI:10.1093/bib']
## nested quantifiers: the outer (?:/ ... )+ repeats the whole "/segment"
## group, so any number of "/"-separated segments is matched
re.findall(r"(?:DOI|doi):\d+\.\d+(?:/(?:\w+\.?)+)+", text_doi)
## ['doi:10.1038/nphys1170', 'doi:10.1002/0470841559.ch1', 'DOI:10.1093/bib/bbab224']

Reference